aboutsummaryrefslogtreecommitdiff
path: root/fs/ocfs2
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/Kconfig76
-rw-r--r--fs/ocfs2/Makefile31
-rw-r--r--fs/ocfs2/acl.c312
-rw-r--r--fs/ocfs2/acl.h39
-rw-r--r--fs/ocfs2/alloc.c7540
-rw-r--r--fs/ocfs2/alloc.h278
-rw-r--r--fs/ocfs2/aops.c2032
-rw-r--r--fs/ocfs2/aops.h76
-rw-r--r--fs/ocfs2/blockcheck.c647
-rw-r--r--fs/ocfs2/blockcheck.h107
-rw-r--r--fs/ocfs2/buffer_head_io.c261
-rw-r--r--fs/ocfs2/buffer_head_io.h42
-rw-r--r--fs/ocfs2/cluster/Makefile2
-rw-r--r--fs/ocfs2/cluster/endian.h30
-rw-r--r--fs/ocfs2/cluster/heartbeat.c1423
-rw-r--r--fs/ocfs2/cluster/heartbeat.h15
-rw-r--r--fs/ocfs2/cluster/masklog.c28
-rw-r--r--fs/ocfs2/cluster/masklog.h154
-rw-r--r--fs/ocfs2/cluster/netdebug.c579
-rw-r--r--fs/ocfs2/cluster/nodemanager.c410
-rw-r--r--fs/ocfs2/cluster/nodemanager.h32
-rw-r--r--fs/ocfs2/cluster/ocfs2_nodemanager.h6
-rw-r--r--fs/ocfs2/cluster/quorum.c32
-rw-r--r--fs/ocfs2/cluster/sys.c84
-rw-r--r--fs/ocfs2/cluster/tcp.c913
-rw-r--r--fs/ocfs2/cluster/tcp.h48
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h106
-rw-r--r--fs/ocfs2/cluster/ver.c42
-rw-r--r--fs/ocfs2/cluster/ver.h31
-rw-r--r--fs/ocfs2/dcache.c165
-rw-r--r--fs/ocfs2/dcache.h3
-rw-r--r--fs/ocfs2/dir.c4370
-rw-r--r--fs/ocfs2/dir.h86
-rw-r--r--fs/ocfs2/dlm/Makefile7
-rw-r--r--fs/ocfs2/dlm/dlmapi.h9
-rw-r--r--fs/ocfs2/dlm/dlmast.c151
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h350
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c72
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c923
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h81
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c1204
-rw-r--r--fs/ocfs2/dlm/dlmfsver.c42
-rw-r--r--fs/ocfs2/dlm/dlmlock.c101
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c1636
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c741
-rw-r--r--fs/ocfs2/dlm/dlmthread.c378
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c62
-rw-r--r--fs/ocfs2/dlm/dlmver.c42
-rw-r--r--fs/ocfs2/dlm/dlmver.h31
-rw-r--r--fs/ocfs2/dlmfs/Makefile5
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c (renamed from fs/ocfs2/dlm/dlmfs.c)285
-rw-r--r--fs/ocfs2/dlmfs/userdlm.c (renamed from fs/ocfs2/dlm/userdlm.c)317
-rw-r--r--fs/ocfs2/dlmfs/userdlm.h (renamed from fs/ocfs2/dlm/userdlm.h)17
-rw-r--r--fs/ocfs2/dlmglue.c2388
-rw-r--r--fs/ocfs2/dlmglue.h114
-rw-r--r--fs/ocfs2/endian.h45
-rw-r--r--fs/ocfs2/export.c247
-rw-r--r--fs/ocfs2/export.h4
-rw-r--r--fs/ocfs2/extent_map.c1585
-rw-r--r--fs/ocfs2/extent_map.h78
-rw-r--r--fs/ocfs2/file.c2623
-rw-r--r--fs/ocfs2/file.h55
-rw-r--r--fs/ocfs2/heartbeat.c283
-rw-r--r--fs/ocfs2/heartbeat.h24
-rw-r--r--fs/ocfs2/inode.c1043
-rw-r--r--fs/ocfs2/inode.h122
-rw-r--r--fs/ocfs2/ioctl.c966
-rw-r--r--fs/ocfs2/ioctl.h10
-rw-r--r--fs/ocfs2/journal.c1805
-rw-r--r--fs/ocfs2/journal.h447
-rw-r--r--fs/ocfs2/localalloc.c783
-rw-r--r--fs/ocfs2/localalloc.h18
-rw-r--r--fs/ocfs2/locks.c141
-rw-r--r--fs/ocfs2/locks.h (renamed from fs/ocfs2/ver.h)13
-rw-r--r--fs/ocfs2/mmap.c162
-rw-r--r--fs/ocfs2/move_extents.c1078
-rw-r--r--fs/ocfs2/move_extents.h22
-rw-r--r--fs/ocfs2/namei.c2524
-rw-r--r--fs/ocfs2/namei.h29
-rw-r--r--fs/ocfs2/ocfs1_fs_compat.h2
-rw-r--r--fs/ocfs2/ocfs2.h598
-rw-r--r--fs/ocfs2/ocfs2_fs.h1069
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h242
-rw-r--r--fs/ocfs2/ocfs2_lockid.h32
-rw-r--r--fs/ocfs2/ocfs2_lockingver.h32
-rw-r--r--fs/ocfs2/ocfs2_trace.h2768
-rw-r--r--fs/ocfs2/quota.h119
-rw-r--r--fs/ocfs2/quota_global.c971
-rw-r--r--fs/ocfs2/quota_local.c1320
-rw-r--r--fs/ocfs2/refcounttree.c4476
-rw-r--r--fs/ocfs2/refcounttree.h118
-rw-r--r--fs/ocfs2/reservations.c839
-rw-r--r--fs/ocfs2/reservations.h159
-rw-r--r--fs/ocfs2/resize.c589
-rw-r--r--fs/ocfs2/resize.h (renamed from fs/ocfs2/dlm/dlmfsver.h)13
-rw-r--r--fs/ocfs2/slot_map.c477
-rw-r--r--fs/ocfs2/slot_map.h34
-rw-r--r--fs/ocfs2/stack_o2cb.c449
-rw-r--r--fs/ocfs2/stack_user.c1136
-rw-r--r--fs/ocfs2/stackglue.c748
-rw-r--r--fs/ocfs2/stackglue.h301
-rw-r--r--fs/ocfs2/suballoc.c2373
-rw-r--r--fs/ocfs2/suballoc.h147
-rw-r--r--fs/ocfs2/super.c1802
-rw-r--r--fs/ocfs2/super.h21
-rw-r--r--fs/ocfs2/symlink.c143
-rw-r--r--fs/ocfs2/symlink.h4
-rw-r--r--fs/ocfs2/sysfile.c93
-rw-r--r--fs/ocfs2/uptodate.c335
-rw-r--r--fs/ocfs2/uptodate.h52
-rw-r--r--fs/ocfs2/ver.c43
-rw-r--r--fs/ocfs2/vote.c1027
-rw-r--r--fs/ocfs2/vote.h51
-rw-r--r--fs/ocfs2/xattr.c7427
-rw-r--r--fs/ocfs2/xattr.h100
115 files changed, 60420 insertions, 12753 deletions
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
new file mode 100644
index 00000000000..77a8de5f711
--- /dev/null
+++ b/fs/ocfs2/Kconfig
@@ -0,0 +1,76 @@
+config OCFS2_FS
+ tristate "OCFS2 file system support"
+ depends on NET && SYSFS && CONFIGFS_FS
+ select JBD2
+ select CRC32
+ select QUOTA
+ select QUOTA_TREE
+ select FS_POSIX_ACL
+ help
+ OCFS2 is a general purpose extent based shared disk cluster file
+ system with many similarities to ext3. It supports 64 bit inode
+ numbers, and has automatically extending metadata groups which may
+ also make it attractive for non-clustered use.
+
+ You'll want to install the ocfs2-tools package in order to at least
+ get "mount.ocfs2".
+
+ Project web page: http://oss.oracle.com/projects/ocfs2
+ Tools web page: http://oss.oracle.com/projects/ocfs2-tools
+ OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+ For more information on OCFS2, see the file
+ <file:Documentation/filesystems/ocfs2.txt>.
+
+config OCFS2_FS_O2CB
+ tristate "O2CB Kernelspace Clustering"
+ depends on OCFS2_FS
+ default y
+ help
+ OCFS2 includes a simple kernelspace clustering package, the OCFS2
+ Cluster Base. It only requires a very small userspace component
+ to configure it. This comes with the standard ocfs2-tools package.
+ O2CB is limited to maintaining a cluster for OCFS2 file systems.
+ It cannot manage any other cluster applications.
+
+ It is always safe to say Y here, as the clustering method is
+ run-time selectable.
+
+config OCFS2_FS_USERSPACE_CLUSTER
+ tristate "OCFS2 Userspace Clustering"
+ depends on OCFS2_FS && DLM
+ default y
+ help
+ This option will allow OCFS2 to use userspace clustering services
+ in conjunction with the DLM in fs/dlm. If you are using a
+ userspace cluster manager, say Y here.
+
+ It is safe to say Y, as the clustering method is run-time
+ selectable.
+
+config OCFS2_FS_STATS
+ bool "OCFS2 statistics"
+ depends on OCFS2_FS && DEBUG_FS
+ default y
+ help
+ This option allows some fs statistics to be captured. Enabling
+ this option may increase the memory consumption.
+
+config OCFS2_DEBUG_MASKLOG
+ bool "OCFS2 logging support"
+ depends on OCFS2_FS
+ default y
+ help
+ The ocfs2 filesystem has an extensive logging system. The system
+ allows selection of events to log via files in /sys/o2cb/logmask/.
+ This option will enlarge your kernel, but it allows debugging of
+ ocfs2 filesystem issues.
+
+config OCFS2_DEBUG_FS
+ bool "OCFS2 expensive checks"
+ depends on OCFS2_FS
+ default n
+ help
+ This option will enable expensive consistency checks. Enable
+ this option for debugging only as it is likely to decrease
+ performance of the filesystem.
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 9fb8132f19b..ce210d4951a 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,12 +1,18 @@
-EXTRA_CFLAGS += -Ifs/ocfs2
+ccflags-y := -Ifs/ocfs2
-EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
+ccflags-y += -DCATCH_BH_JBD_RACES
-obj-$(CONFIG_OCFS2_FS) += ocfs2.o
+obj-$(CONFIG_OCFS2_FS) += \
+ ocfs2.o \
+ ocfs2_stackglue.o
+
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o
+obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
ocfs2-objs := \
alloc.o \
aops.o \
+ blockcheck.o \
buffer_head_io.o \
dcache.o \
dir.o \
@@ -19,16 +25,29 @@ ocfs2-objs := \
ioctl.o \
journal.o \
localalloc.o \
+ locks.o \
mmap.o \
namei.o \
+ refcounttree.o \
+ reservations.o \
+ move_extents.o \
+ resize.o \
slot_map.o \
suballoc.o \
super.o \
symlink.o \
sysfile.o \
uptodate.o \
- ver.o \
- vote.o
+ quota_local.o \
+ quota_global.o \
+ xattr.o \
+ acl.o
+
+ocfs2_stackglue-objs := stackglue.o
+ocfs2_stack_o2cb-objs := stack_o2cb.o
+ocfs2_stack_user-objs := stack_user.o
+obj-$(CONFIG_OCFS2_FS) += dlmfs/
+# cluster/ is always needed when OCFS2_FS for masklog support
obj-$(CONFIG_OCFS2_FS) += cluster/
-obj-$(CONFIG_OCFS2_FS) += dlm/
+obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
new file mode 100644
index 00000000000..7e8282dcea2
--- /dev/null
+++ b/fs/ocfs2/acl.c
@@ -0,0 +1,312 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.c
+ *
+ * Copyright (C) 2004, 2008 Oracle. All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copy from linux/fs/ext3/acl.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "ocfs2_fs.h"
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from xattr value to acl struct.
+ */
+static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
+{
+ int n, count;
+ struct posix_acl *acl;
+
+ if (!value)
+ return NULL;
+ if (size < sizeof(struct posix_acl_entry))
+ return ERR_PTR(-EINVAL);
+
+ count = size / sizeof(struct posix_acl_entry);
+
+ acl = posix_acl_alloc(count, GFP_NOFS);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+ for (n = 0; n < count; n++) {
+ struct ocfs2_acl_entry *entry =
+ (struct ocfs2_acl_entry *)value;
+
+ acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
+ acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+ switch(acl->a_entries[n].e_tag) {
+ case ACL_USER:
+ acl->a_entries[n].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+ case ACL_GROUP:
+ acl->a_entries[n].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+ default:
+ break;
+ }
+ value += sizeof(struct posix_acl_entry);
+
+ }
+ return acl;
+}
+
+/*
+ * Convert acl struct to xattr value.
+ */
+static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
+{
+ struct ocfs2_acl_entry *entry = NULL;
+ char *ocfs2_acl;
+ size_t n;
+
+ *size = acl->a_count * sizeof(struct posix_acl_entry);
+
+ ocfs2_acl = kmalloc(*size, GFP_NOFS);
+ if (!ocfs2_acl)
+ return ERR_PTR(-ENOMEM);
+
+ entry = (struct ocfs2_acl_entry *)ocfs2_acl;
+ for (n = 0; n < acl->a_count; n++, entry++) {
+ entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
+ entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+ switch(acl->a_entries[n].e_tag) {
+ case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns,
+ acl->a_entries[n].e_uid));
+ break;
+ case ACL_GROUP:
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns,
+ acl->a_entries[n].e_gid));
+ break;
+ default:
+ entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+ break;
+ }
+ }
+ return ocfs2_acl;
+}
+
+static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
+ int type,
+ struct buffer_head *di_bh)
+{
+ int name_index;
+ char *value = NULL;
+ struct posix_acl *acl;
+ int retval;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0);
+ if (retval > 0) {
+ value = kmalloc(retval, GFP_NOFS);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+ "", value, retval);
+ }
+
+ if (retval > 0)
+ acl = ocfs2_acl_from_xattr(value, retval);
+ else if (retval == -ENODATA || retval == 0)
+ acl = NULL;
+ else
+ acl = ERR_PTR(retval);
+
+ kfree(value);
+
+ return acl;
+}
+
+/*
+ * Helper function to set i_mode in memory and disk. Some call paths
+ * will not have di_bh or a journal handle to pass, in which case it
+ * will create it's own.
+ */
+static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
+ handle_t *handle, umode_t new_mode)
+{
+ int ret, commit_handle = 0;
+ struct ocfs2_dinode *di;
+
+ if (di_bh == NULL) {
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else
+ get_bh(di_bh);
+
+ if (handle == NULL) {
+ handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_brelse;
+ }
+
+ commit_handle = 1;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ inode->i_mode = new_mode;
+ inode->i_ctime = CURRENT_TIME;
+ di->i_mode = cpu_to_le16(inode->i_mode);
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+ if (commit_handle)
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out_brelse:
+ brelse(di_bh);
+out:
+ return ret;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ */
+int ocfs2_set_acl(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *di_bh,
+ int type,
+ struct posix_acl *acl,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ int name_index;
+ void *value = NULL;
+ size_t size = 0;
+ int ret;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+ if (acl) {
+ umode_t mode = inode->i_mode;
+ ret = posix_acl_equiv_mode(acl, &mode);
+ if (ret < 0)
+ return ret;
+ else {
+ if (ret == 0)
+ acl = NULL;
+
+ ret = ocfs2_acl_set_mode(inode, di_bh,
+ handle, mode);
+ if (ret)
+ return ret;
+
+ }
+ }
+ break;
+ case ACL_TYPE_DEFAULT:
+ name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (acl) {
+ value = ocfs2_acl_to_xattr(acl, &size);
+ if (IS_ERR(value))
+ return (int)PTR_ERR(value);
+ }
+
+ if (handle)
+ ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
+ "", value, size, 0,
+ meta_ac, data_ac);
+ else
+ ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
+
+ kfree(value);
+
+ return ret;
+}
+
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+}
+
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
+{
+ struct ocfs2_super *osb;
+ struct buffer_head *di_bh = NULL;
+ struct posix_acl *acl;
+ int ret = -EAGAIN;
+
+ osb = OCFS2_SB(inode->i_sb);
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+ return NULL;
+
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+
+ brelse(di_bh);
+
+ return acl;
+}
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
new file mode 100644
index 00000000000..3fce68d0862
--- /dev/null
+++ b/fs/ocfs2/acl.h
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.h
+ *
+ * Copyright (C) 2004, 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_ACL_H
+#define OCFS2_ACL_H
+
+#include <linux/posix_acl_xattr.h>
+
+struct ocfs2_acl_entry {
+ __le16 e_tag;
+ __le16 e_perm;
+ __le32 e_id;
+};
+
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ocfs2_set_acl(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *di_bh,
+ int type,
+ struct posix_acl *acl,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac);
+
+#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f43bc5f18a3..9d8fcf2f3b9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -27,13 +27,17 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/quotaops.h>
+#include <linux/blkdev.h>
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
+#include "aops.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
@@ -44,105 +48,944 @@
#include "file.h"
#include "super.h"
#include "uptodate.h"
+#include "xattr.h"
+#include "refcounttree.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
-static int ocfs2_extent_contig(struct inode *inode,
- struct ocfs2_extent_rec *ext,
- u64 blkno);
+enum ocfs2_contig_type {
+ CONTIG_NONE = 0,
+ CONTIG_LEFT,
+ CONTIG_RIGHT,
+ CONTIG_LEFTRIGHT,
+};
+
+static enum ocfs2_contig_type
+ ocfs2_extent_rec_contig(struct super_block *sb,
+ struct ocfs2_extent_rec *ext,
+ struct ocfs2_extent_rec *insert_rec);
+/*
+ * Operations for a specific extent tree type.
+ *
+ * To implement an on-disk btree (extent tree) type in ocfs2, add
+ * an ocfs2_extent_tree_operations structure and the matching
+ * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it
+ * for the allocation portion of the extent tree.
+ */
+struct ocfs2_extent_tree_operations {
+ /*
+ * last_eb_blk is the block number of the right most leaf extent
+ * block. Most on-disk structures containing an extent tree store
+ * this value for fast access. The ->eo_set_last_eb_blk() and
+ * ->eo_get_last_eb_blk() operations access this value. They are
+ * both required.
+ */
+ void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
+ u64 blkno);
+ u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
-static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *inode,
- int wanted,
- struct ocfs2_alloc_context *meta_ac,
- struct buffer_head *bhs[]);
+ /*
+ * The on-disk structure usually keeps track of how many total
+ * clusters are stored in this extent tree. This function updates
+ * that value. new_clusters is the delta, and must be
+ * added to the total. Required.
+ */
+ void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
+ u32 new_clusters);
-static int ocfs2_add_branch(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct buffer_head *eb_bh,
- struct buffer_head *last_eb_bh,
- struct ocfs2_alloc_context *meta_ac);
+ /*
+ * If this extent tree is supported by an extent map, insert
+ * a record into the map.
+ */
+ void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec);
-static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_alloc_context *meta_ac,
- struct buffer_head **ret_new_eb_bh);
+ /*
+ * If this extent tree is supported by an extent map, truncate the
+ * map to clusters,
+ */
+ void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
+ u32 clusters);
+
+ /*
+ * If ->eo_insert_check() exists, it is called before rec is
+ * inserted into the extent tree. It is optional.
+ */
+ int (*eo_insert_check)(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec);
+ int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
+
+ /*
+ * --------------------------------------------------------------
+ * The remaining are internal to ocfs2_extent_tree and don't have
+ * accessor functions
+ */
+
+ /*
+ * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
+ * It is required.
+ */
+ void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
+
+ /*
+ * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
+ * it exists. If it does not, et->et_max_leaf_clusters is set
+ * to 0 (unlimited). Optional.
+ */
+ void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
+
+ /*
+ * ->eo_extent_contig test whether the 2 ocfs2_extent_rec
+ * are contiguous or not. Optional. Don't need to set it if use
+ * ocfs2_extent_rec as the tree leaf.
+ */
+ enum ocfs2_contig_type
+ (*eo_extent_contig)(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *ext,
+ struct ocfs2_extent_rec *insert_rec);
+};
+
+
+/*
+ * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
+ * in the methods.
+ */
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno);
+static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
+ u32 clusters);
+static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec);
+static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
+ u32 clusters);
+static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec);
+static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
+static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
+ .eo_update_clusters = ocfs2_dinode_update_clusters,
+ .eo_extent_map_insert = ocfs2_dinode_extent_map_insert,
+ .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate,
+ .eo_insert_check = ocfs2_dinode_insert_check,
+ .eo_sanity_check = ocfs2_dinode_sanity_check,
+ .eo_fill_root_el = ocfs2_dinode_fill_root_el,
+};
+
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno)
+{
+ struct ocfs2_dinode *di = et->et_object;
+
+ BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+ di->i_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_dinode *di = et->et_object;
+
+ BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+ return le64_to_cpu(di->i_last_eb_blk);
+}
+
+static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
+ struct ocfs2_dinode *di = et->et_object;
+
+ le32_add_cpu(&di->i_clusters, clusters);
+ spin_lock(&oi->ip_lock);
+ oi->ip_clusters = le32_to_cpu(di->i_clusters);
+ spin_unlock(&oi->ip_lock);
+}
+
+static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec)
+{
+ struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
+
+ ocfs2_extent_map_insert_rec(inode, rec);
+}
+
+static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
+
+ ocfs2_extent_map_trunc(inode, clusters);
+}
+
+static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
+ struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
+
+ BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
+ mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
+ (oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
+ "Device %s, asking for sparse allocation: inode %llu, "
+ "cpos %u, clusters %u\n",
+ osb->dev_str,
+ (unsigned long long)oi->ip_blkno,
+ rec->e_cpos, oi->ip_clusters);
+
+ return 0;
+}
+
+static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_dinode *di = et->et_object;
+
+ BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+ return 0;
+}
+
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_dinode *di = et->et_object;
-static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
- u64 blkno,
- u32 new_clusters);
+ et->et_root_el = &di->id2.i_list;
+}
-static int ocfs2_find_branch_target(struct ocfs2_super *osb,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct buffer_head **target_bh);
-static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
- struct inode *inode,
- struct ocfs2_dinode *fe,
- unsigned int new_i_clusters,
- struct buffer_head *old_last_eb,
- struct buffer_head **new_last_eb);
+static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_xattr_value_buf *vb = et->et_object;
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+ et->et_root_el = &vb->vb_xv->xr_list;
+}
-static int ocfs2_extent_contig(struct inode *inode,
- struct ocfs2_extent_rec *ext,
- u64 blkno)
+static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno)
{
- return blkno == (le64_to_cpu(ext->e_blkno) +
- ocfs2_clusters_to_blocks(inode->i_sb,
- le32_to_cpu(ext->e_clusters)));
+ struct ocfs2_xattr_value_buf *vb = et->et_object;
+
+ vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_xattr_value_buf *vb = et->et_object;
+
+ return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
+}
+
+static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct ocfs2_xattr_value_buf *vb = et->et_object;
+
+ le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
+ .eo_update_clusters = ocfs2_xattr_value_update_clusters,
+ .eo_fill_root_el = ocfs2_xattr_value_fill_root_el,
+};
+
+static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_xattr_block *xb = et->et_object;
+
+ et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
+}
+
+static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
+{
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+ et->et_max_leaf_clusters =
+ ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
+}
+
+static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno)
+{
+ struct ocfs2_xattr_block *xb = et->et_object;
+ struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+ xt->xt_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_xattr_block *xb = et->et_object;
+ struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+ return le64_to_cpu(xt->xt_last_eb_blk);
+}
+
+static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct ocfs2_xattr_block *xb = et->et_object;
+
+ le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
+ .eo_update_clusters = ocfs2_xattr_tree_update_clusters,
+ .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el,
+ .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
+};
+
+static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno)
+{
+ struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+ dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+ return le64_to_cpu(dx_root->dr_last_eb_blk);
+}
+
+static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+ le32_add_cpu(&dx_root->dr_clusters, clusters);
+}
+
+static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+ BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
+
+ return 0;
+}
+
+static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+ et->et_root_el = &dx_root->dr_list;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
+ .eo_update_clusters = ocfs2_dx_root_update_clusters,
+ .eo_sanity_check = ocfs2_dx_root_sanity_check,
+ .eo_fill_root_el = ocfs2_dx_root_fill_root_el,
+};
+
+static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_refcount_block *rb = et->et_object;
+
+ et->et_root_el = &rb->rf_list;
+}
+
+static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno)
+{
+ struct ocfs2_refcount_block *rb = et->et_object;
+
+ rb->rf_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_refcount_block *rb = et->et_object;
+
+ return le64_to_cpu(rb->rf_last_eb_blk);
+}
+
+static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct ocfs2_refcount_block *rb = et->et_object;
+
+ le32_add_cpu(&rb->rf_clusters, clusters);
+}
+
+static enum ocfs2_contig_type
+ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *ext,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ return CONTIG_NONE;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
+ .eo_update_clusters = ocfs2_refcount_tree_update_clusters,
+ .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el,
+ .eo_extent_contig = ocfs2_refcount_tree_extent_contig,
+};
+
+static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh,
+ ocfs2_journal_access_func access,
+ void *obj,
+ struct ocfs2_extent_tree_operations *ops)
+{
+ et->et_ops = ops;
+ et->et_root_bh = bh;
+ et->et_ci = ci;
+ et->et_root_journal_access = access;
+ if (!obj)
+ obj = (void *)bh->b_data;
+ et->et_object = obj;
+
+ et->et_ops->eo_fill_root_el(et);
+ if (!et->et_ops->eo_fill_max_leaf_clusters)
+ et->et_max_leaf_clusters = 0;
+ else
+ et->et_ops->eo_fill_max_leaf_clusters(et);
+}
+
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh)
+{
+ __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
+ NULL, &ocfs2_dinode_et_ops);
+}
+
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh)
+{
+ __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
+ NULL, &ocfs2_xattr_tree_et_ops);
+}
+
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct ocfs2_xattr_value_buf *vb)
+{
+ __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
+ &ocfs2_xattr_value_et_ops);
+}
+
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh)
+{
+ __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
+ NULL, &ocfs2_dx_root_et_ops);
+}
+
+void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh)
+{
+ __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
+ NULL, &ocfs2_refcount_tree_et_ops);
+}
+
+static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 new_last_eb_blk)
+{
+ et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
+}
+
+static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ return et->et_ops->eo_get_last_eb_blk(et);
+}
+
+static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ et->et_ops->eo_update_clusters(et, clusters);
+}
+
+static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec)
+{
+ if (et->et_ops->eo_extent_map_insert)
+ et->et_ops->eo_extent_map_insert(et, rec);
+}
+
+static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ if (et->et_ops->eo_extent_map_truncate)
+ et->et_ops->eo_extent_map_truncate(et, clusters);
+}
+
+static inline int ocfs2_et_root_journal_access(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ int type)
+{
+ return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
+ type);
+}
+
+static inline enum ocfs2_contig_type
+ ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ if (et->et_ops->eo_extent_contig)
+ return et->et_ops->eo_extent_contig(et, rec, insert_rec);
+
+ return ocfs2_extent_rec_contig(
+ ocfs2_metadata_cache_get_super(et->et_ci),
+ rec, insert_rec);
+}
+
+static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec)
+{
+ int ret = 0;
+
+ if (et->et_ops->eo_insert_check)
+ ret = et->et_ops->eo_insert_check(et, rec);
+ return ret;
+}
+
+static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
+{
+ int ret = 0;
+
+ if (et->et_ops->eo_sanity_check)
+ ret = et->et_ops->eo_sanity_check(et);
+ return ret;
+}
+
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ struct ocfs2_extent_block *eb);
+static void ocfs2_adjust_rightmost_records(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_extent_rec *insert_rec);
+/*
+ * Reset the actual path elements so that we can re-use the structure
+ * to build another path. Generally, this involves freeing the buffer
+ * heads.
+ */
+void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
+{
+ int i, start = 0, depth = 0;
+ struct ocfs2_path_item *node;
+
+ if (keep_root)
+ start = 1;
+
+ for(i = start; i < path_num_items(path); i++) {
+ node = &path->p_node[i];
+
+ brelse(node->bh);
+ node->bh = NULL;
+ node->el = NULL;
+ }
+
+ /*
+ * Tree depth may change during truncate, or insert. If we're
+ * keeping the root extent list, then make sure that our path
+ * structure reflects the proper depth.
+ */
+ if (keep_root)
+ depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
+ else
+ path_root_access(path) = NULL;
+
+ path->p_tree_depth = depth;
+}
+
+void ocfs2_free_path(struct ocfs2_path *path)
+{
+ if (path) {
+ ocfs2_reinit_path(path, 0);
+ kfree(path);
+ }
+}
+
+/*
+ * All the elements of src into dest. After this call, src could be freed
+ * without affecting dest.
+ *
+ * Both paths should have the same root. Any non-root elements of dest
+ * will be freed.
+ */
+static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
+{
+ int i;
+
+ BUG_ON(path_root_bh(dest) != path_root_bh(src));
+ BUG_ON(path_root_el(dest) != path_root_el(src));
+ BUG_ON(path_root_access(dest) != path_root_access(src));
+
+ ocfs2_reinit_path(dest, 1);
+
+ for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+ dest->p_node[i].bh = src->p_node[i].bh;
+ dest->p_node[i].el = src->p_node[i].el;
+
+ if (dest->p_node[i].bh)
+ get_bh(dest->p_node[i].bh);
+ }
+}
+
+/*
+ * Make the *dest path the same as src and re-initialize src path to
+ * have a root only.
+ */
+static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
+{
+ int i;
+
+ BUG_ON(path_root_bh(dest) != path_root_bh(src));
+ BUG_ON(path_root_access(dest) != path_root_access(src));
+
+ for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+ brelse(dest->p_node[i].bh);
+
+ dest->p_node[i].bh = src->p_node[i].bh;
+ dest->p_node[i].el = src->p_node[i].el;
+
+ src->p_node[i].bh = NULL;
+ src->p_node[i].el = NULL;
+ }
+}
+
+/*
+ * Insert an extent block at given index.
+ *
+ * This will not take an additional reference on eb_bh.
+ */
+static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
+ struct buffer_head *eb_bh)
+{
+ struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+
+ /*
+ * Right now, no root bh is an extent block, so this helps
+ * catch code errors with dinode trees. The assertion can be
+ * safely removed if we ever need to insert extent block
+ * structures at the root.
+ */
+ BUG_ON(index == 0);
+
+ path->p_node[index].bh = eb_bh;
+ path->p_node[index].el = &eb->h_list;
+}
+
+static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
+ struct ocfs2_extent_list *root_el,
+ ocfs2_journal_access_func access)
+{
+ struct ocfs2_path *path;
+
+ BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
+
+ path = kzalloc(sizeof(*path), GFP_NOFS);
+ if (path) {
+ path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
+ get_bh(root_bh);
+ path_root_bh(path) = root_bh;
+ path_root_el(path) = root_el;
+ path_root_access(path) = access;
+ }
+
+ return path;
+}
+
+struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
+{
+ return ocfs2_new_path(path_root_bh(path), path_root_el(path),
+ path_root_access(path));
+}
+
+struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
+{
+ return ocfs2_new_path(et->et_root_bh, et->et_root_el,
+ et->et_root_journal_access);
}
/*
+ * Journal the buffer at depth idx. All idx>0 are extent_blocks,
+ * otherwise it's the root_access function.
+ *
+ * I don't like the way this function's name looks next to
+ * ocfs2_journal_access_path(), but I don't have a better one.
+ */
+int ocfs2_path_bh_journal_access(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct ocfs2_path *path,
+ int idx)
+{
+ ocfs2_journal_access_func access = path_root_access(path);
+
+ if (!access)
+ access = ocfs2_journal_access;
+
+ if (idx)
+ access = ocfs2_journal_access_eb;
+
+ return access(handle, ci, path->p_node[idx].bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+}
+
+/*
+ * Convenience function to journal all components in a path.
+ */
+int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
+ handle_t *handle,
+ struct ocfs2_path *path)
+{
+ int i, ret = 0;
+
+ if (!path)
+ goto out;
+
+ for(i = 0; i < path_num_items(path); i++) {
+ ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * Return the index of the extent record which contains cluster #v_cluster.
+ * -1 is returned if it was not found.
+ *
+ * Should work fine on interior and exterior nodes.
+ */
+int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
+{
+ int ret = -1;
+ int i;
+ struct ocfs2_extent_rec *rec;
+ u32 rec_end, rec_start, clusters;
+
+ for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
+
+ rec_start = le32_to_cpu(rec->e_cpos);
+ clusters = ocfs2_rec_clusters(el, rec);
+
+ rec_end = rec_start + clusters;
+
+ if (v_cluster >= rec_start && v_cluster < rec_end) {
+ ret = i;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
+ * ocfs2_extent_rec_contig only work properly against leaf nodes!
+ */
+static int ocfs2_block_extent_contig(struct super_block *sb,
+ struct ocfs2_extent_rec *ext,
+ u64 blkno)
+{
+ u64 blk_end = le64_to_cpu(ext->e_blkno);
+
+ blk_end += ocfs2_clusters_to_blocks(sb,
+ le16_to_cpu(ext->e_leaf_clusters));
+
+ return blkno == blk_end;
+}
+
+static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
+ struct ocfs2_extent_rec *right)
+{
+ u32 left_range;
+
+ left_range = le32_to_cpu(left->e_cpos) +
+ le16_to_cpu(left->e_leaf_clusters);
+
+ return (left_range == le32_to_cpu(right->e_cpos));
+}
+
+static enum ocfs2_contig_type
+ ocfs2_extent_rec_contig(struct super_block *sb,
+ struct ocfs2_extent_rec *ext,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ u64 blkno = le64_to_cpu(insert_rec->e_blkno);
+
+ /*
+ * Refuse to coalesce extent records with different flag
+ * fields - we don't want to mix unwritten extents with user
+ * data.
+ */
+ if (ext->e_flags != insert_rec->e_flags)
+ return CONTIG_NONE;
+
+ if (ocfs2_extents_adjacent(ext, insert_rec) &&
+ ocfs2_block_extent_contig(sb, ext, blkno))
+ return CONTIG_RIGHT;
+
+ blkno = le64_to_cpu(ext->e_blkno);
+ if (ocfs2_extents_adjacent(insert_rec, ext) &&
+ ocfs2_block_extent_contig(sb, insert_rec, blkno))
+ return CONTIG_LEFT;
+
+ return CONTIG_NONE;
+}
+
+/*
+ * NOTE: We can have pretty much any combination of contiguousness and
+ * appending.
+ *
+ * The usefulness of APPEND_TAIL is more in that it lets us know that
+ * we'll have to update the path to that leaf.
+ */
+enum ocfs2_append_type {
+ APPEND_NONE = 0,
+ APPEND_TAIL,
+};
+
+enum ocfs2_split_type {
+ SPLIT_NONE = 0,
+ SPLIT_LEFT,
+ SPLIT_RIGHT,
+};
+
+struct ocfs2_insert_type {
+ enum ocfs2_split_type ins_split;
+ enum ocfs2_append_type ins_appending;
+ enum ocfs2_contig_type ins_contig;
+ int ins_contig_index;
+ int ins_tree_depth;
+};
+
+struct ocfs2_merge_ctxt {
+ enum ocfs2_contig_type c_contig_type;
+ int c_has_empty_extent;
+ int c_split_covers_rec;
+};
+
+static int ocfs2_validate_extent_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_extent_block *eb =
+ (struct ocfs2_extent_block *)bh->b_data;
+
+ trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
+ if (rc) {
+ mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ return rc;
+ }
+
+ /*
+ * Errors after here are fatal.
+ */
+
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ ocfs2_error(sb,
+ "Extent block #%llu has bad signature %.*s",
+ (unsigned long long)bh->b_blocknr, 7,
+ eb->h_signature);
+ return -EINVAL;
+ }
+
+ if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
+ ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_blkno "
+ "of %llu",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(eb->h_blkno));
+ return -EINVAL;
+ }
+
+ if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+ ocfs2_error(sb,
+ "Extent block #%llu has an invalid "
+ "h_fs_generation of #%u",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(eb->h_fs_generation));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
+ struct buffer_head **bh)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_block(ci, eb_blkno, &tmp,
+ ocfs2_validate_extent_block);
+
+ /* If ocfs2_read_block() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
+
+/*
* How many free extents have we got before we need more meta data?
*/
int ocfs2_num_free_extents(struct ocfs2_super *osb,
- struct inode *inode,
- struct ocfs2_dinode *fe)
+ struct ocfs2_extent_tree *et)
{
int retval;
- struct ocfs2_extent_list *el;
+ struct ocfs2_extent_list *el = NULL;
struct ocfs2_extent_block *eb;
struct buffer_head *eb_bh = NULL;
+ u64 last_eb_blk = 0;
- mlog_entry_void();
+ el = et->et_root_el;
+ last_eb_blk = ocfs2_et_get_last_eb_blk(et);
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- retval = -EIO;
- goto bail;
- }
-
- if (fe->i_last_eb_blk) {
- retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
- &eb_bh, OCFS2_BH_CACHED, inode);
+ if (last_eb_blk) {
+ retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
+ &eb_bh);
if (retval < 0) {
mlog_errno(retval);
goto bail;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
- } else
- el = &fe->id2.i_list;
+ }
BUG_ON(el->l_tree_depth != 0);
retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
- if (eb_bh)
- brelse(eb_bh);
+ brelse(eb_bh);
- mlog_exit(retval);
+ trace_ocfs2_num_free_extents(retval);
return retval;
}
@@ -151,9 +994,8 @@ bail:
* sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
* l_count for you
*/
-static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *inode,
+static int ocfs2_create_new_meta_bhs(handle_t *handle,
+ struct ocfs2_extent_tree *et,
int wanted,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head *bhs[])
@@ -161,17 +1003,17 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
int count, status, i;
u16 suballoc_bit_start;
u32 num_got;
- u64 first_blkno;
+ u64 suballoc_loc, first_blkno;
+ struct ocfs2_super *osb =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
struct ocfs2_extent_block *eb;
- mlog_entry_void();
-
count = 0;
while (count < wanted) {
- status = ocfs2_claim_metadata(osb,
- handle,
+ status = ocfs2_claim_metadata(handle,
meta_ac,
wanted - count,
+ &suballoc_loc,
&suballoc_bit_start,
&num_got,
&first_blkno);
@@ -183,14 +1025,15 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
for(i = count; i < (num_got + count); i++) {
bhs[i] = sb_getblk(osb->sb, first_blkno);
if (bhs[i] == NULL) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
- ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+ ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
- status = ocfs2_journal_access(handle, inode, bhs[i],
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_eb(handle, et->et_ci,
+ bhs[i],
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -202,13 +1045,9 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
eb->h_blkno = cpu_to_le64(first_blkno);
eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
-
-#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
- /* we always use slot zero's suballocator */
- eb->h_suballoc_slot = 0;
-#else
- eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
-#endif
+ eb->h_suballoc_slot =
+ cpu_to_le16(meta_ac->ac_alloc_slot);
+ eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
eb->h_list.l_count =
cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -218,11 +1057,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
/* We'll also be dirtied by the caller, so
* this isn't absolutely necessary. */
- status = ocfs2_journal_dirty(handle, bhs[i]);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, bhs[i]);
}
count += num_got;
@@ -232,60 +1067,145 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
bail:
if (status < 0) {
for(i = 0; i < wanted; i++) {
- if (bhs[i])
- brelse(bhs[i]);
+ brelse(bhs[i]);
bhs[i] = NULL;
}
+ mlog_errno(status);
+ }
+ return status;
+}
+
+/*
+ * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
+ *
+ * Returns the sum of the rightmost extent rec logical offset and
+ * cluster count.
+ *
+ * ocfs2_add_branch() uses this to determine what logical cluster
+ * value should be populated into the leftmost new branch records.
+ *
+ * ocfs2_shift_tree_depth() uses this to determine the # clusters
+ * value for the new topmost tree record.
+ */
+static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
+{
+ int i;
+
+ i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+ return le32_to_cpu(el->l_recs[i].e_cpos) +
+ ocfs2_rec_clusters(el, &el->l_recs[i]);
+}
+
+/*
+ * Change range of the branches in the right most path according to the leaf
+ * extent block's rightmost record.
+ */
+static int ocfs2_adjust_rightmost_branch(handle_t *handle,
+ struct ocfs2_extent_tree *et)
+{
+ int status;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+
+ path = ocfs2_new_path_from_et(et);
+ if (!path) {
+ status = -ENOMEM;
+ return status;
}
- mlog_exit(status);
+
+ status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ status = ocfs2_extend_trans(handle, path_num_items(path));
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ status = ocfs2_journal_access_path(et->et_ci, handle, path);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+ rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1];
+
+ ocfs2_adjust_rightmost_records(handle, et, path, rec);
+
+out:
+ ocfs2_free_path(path);
return status;
}
/*
* Add an entire tree branch to our inode. eb_bh is the extent block
- * to start at, if we don't want to start the branch at the dinode
+ * to start at, if we don't want to start the branch at the root
* structure.
*
* last_eb_bh is required as we have to update it's next_leaf pointer
* for the new last extent block.
*
* the new branch will be 'empty' in the sense that every block will
- * contain a single record with e_clusters == 0.
+ * contain a single record with cluster count == 0.
*/
-static int ocfs2_add_branch(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
+static int ocfs2_add_branch(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct buffer_head *eb_bh,
- struct buffer_head *last_eb_bh,
+ struct buffer_head **last_eb_bh,
struct ocfs2_alloc_context *meta_ac)
{
int status, new_blocks, i;
u64 next_blkno, new_last_eb_blk;
struct buffer_head *bh;
struct buffer_head **new_eb_bhs = NULL;
- struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *eb_el;
struct ocfs2_extent_list *el;
+ u32 new_cpos, root_end;
- mlog_entry_void();
-
- BUG_ON(!last_eb_bh);
-
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
+ BUG_ON(!last_eb_bh || !*last_eb_bh);
if (eb_bh) {
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
} else
- el = &fe->id2.i_list;
+ el = et->et_root_el;
/* we never add a branch to a leaf. */
BUG_ON(!el->l_tree_depth);
new_blocks = le16_to_cpu(el->l_tree_depth);
+ eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
+ new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
+ root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
+
+ /*
+ * If there is a gap before the root end and the real end
+ * of the righmost leaf block, we need to remove the gap
+ * between new_cpos and root_end first so that the tree
+ * is consistent after we add a new branch(it will start
+ * from new_cpos).
+ */
+ if (root_end > new_cpos) {
+ trace_ocfs2_adjust_rightmost_branch(
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci),
+ root_end, new_cpos);
+
+ status = ocfs2_adjust_rightmost_branch(handle, et);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
/* allocate the number of new eb blocks we need */
new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
GFP_KERNEL);
@@ -295,7 +1215,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
+ status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
meta_ac, new_eb_bhs);
if (status < 0) {
mlog_errno(status);
@@ -313,15 +1233,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
for(i = 0; i < new_blocks; i++) {
bh = new_eb_bhs[i];
eb = (struct ocfs2_extent_block *) bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
- }
+ /* ocfs2_create_new_meta_bhs() should create it right! */
+ BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
eb_el = &eb->h_list;
- status = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -330,18 +1247,22 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
eb->h_next_leaf_blk = 0;
eb_el->l_tree_depth = cpu_to_le16(i);
eb_el->l_next_free_rec = cpu_to_le16(1);
- eb_el->l_recs[0].e_cpos = fe->i_clusters;
+ /*
+ * This actually counts as an empty extent as
+ * c_clusters == 0
+ */
+ eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
- eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+ /*
+ * eb_el isn't always an interior node, but even leaf
+ * nodes want a zero'd flags and reserved field so
+ * this gets the whole 32 bits regardless of use.
+ */
+ eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
if (!eb_el->l_tree_depth)
new_last_eb_blk = le64_to_cpu(eb->h_blkno);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
+ ocfs2_journal_dirty(handle, bh);
next_blkno = le64_to_cpu(eb->h_blkno);
}
@@ -351,21 +1272,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
* journal_dirty erroring as it won't unless we've aborted the
* handle (in which case we would never be here) so reserving
* the write with journal_access is all we need to do. */
- status = ocfs2_journal_access(handle, inode, last_eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_et_root_journal_access(handle, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
if (eb_bh) {
- status = ocfs2_journal_access(handle, inode, eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -373,42 +1294,41 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
}
/* Link the new branch into the rest of the tree (el will
- * either be on the fe, or the extent block passed in. */
+ * either be on the root_bh, or the extent block passed in. */
i = le16_to_cpu(el->l_next_free_rec);
el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
- el->l_recs[i].e_cpos = fe->i_clusters;
- el->l_recs[i].e_clusters = 0;
+ el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
+ el->l_recs[i].e_int_clusters = 0;
le16_add_cpu(&el->l_next_free_rec, 1);
/* fe needs a new last extent block pointer, as does the
* next_leaf on the previously last-extent-block. */
- fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+ ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
- status = ocfs2_journal_dirty(handle, last_eb_bh);
- if (status < 0)
- mlog_errno(status);
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0)
- mlog_errno(status);
- if (eb_bh) {
- status = ocfs2_journal_dirty(handle, eb_bh);
- if (status < 0)
- mlog_errno(status);
- }
+ ocfs2_journal_dirty(handle, *last_eb_bh);
+ ocfs2_journal_dirty(handle, et->et_root_bh);
+ if (eb_bh)
+ ocfs2_journal_dirty(handle, eb_bh);
+
+ /*
+ * Some callers want to track the rightmost leaf so pass it
+ * back here.
+ */
+ brelse(*last_eb_bh);
+ get_bh(new_eb_bhs[0]);
+ *last_eb_bh = new_eb_bhs[0];
status = 0;
bail:
if (new_eb_bhs) {
for (i = 0; i < new_blocks; i++)
- if (new_eb_bhs[i])
- brelse(new_eb_bhs[i]);
+ brelse(new_eb_bhs[i]);
kfree(new_eb_bhs);
}
- mlog_exit(status);
return status;
}
@@ -417,23 +1337,19 @@ bail:
* returns back the new extent block so you can add a branch to it
* after this call.
*/
-static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
+static int ocfs2_shift_tree_depth(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **ret_new_eb_bh)
{
int status, i;
+ u32 new_clusters;
struct buffer_head *new_eb_bh = NULL;
- struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *fe_el;
+ struct ocfs2_extent_list *root_el;
struct ocfs2_extent_list *eb_el;
- mlog_entry_void();
-
- status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
+ status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
&new_eb_bh);
if (status < 0) {
mlog_errno(status);
@@ -441,269 +1357,58 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
}
eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
- }
+ /* ocfs2_create_new_meta_bhs() should create it right! */
+ BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
eb_el = &eb->h_list;
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
- fe_el = &fe->id2.i_list;
+ root_el = et->et_root_el;
- status = ocfs2_journal_access(handle, inode, new_eb_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- /* copy the fe data into the new extent block */
- eb_el->l_tree_depth = fe_el->l_tree_depth;
- eb_el->l_next_free_rec = fe_el->l_next_free_rec;
- for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
- eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
- eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
- eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
- }
+ /* copy the root extent list data into the new extent block */
+ eb_el->l_tree_depth = root_el->l_tree_depth;
+ eb_el->l_next_free_rec = root_el->l_next_free_rec;
+ for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+ eb_el->l_recs[i] = root_el->l_recs[i];
- status = ocfs2_journal_dirty(handle, new_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, new_eb_bh);
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_et_root_journal_access(handle, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- /* update fe now */
- le16_add_cpu(&fe_el->l_tree_depth, 1);
- fe_el->l_recs[0].e_cpos = 0;
- fe_el->l_recs[0].e_blkno = eb->h_blkno;
- fe_el->l_recs[0].e_clusters = fe->i_clusters;
- for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
- fe_el->l_recs[i].e_cpos = 0;
- fe_el->l_recs[i].e_clusters = 0;
- fe_el->l_recs[i].e_blkno = 0;
- }
- fe_el->l_next_free_rec = cpu_to_le16(1);
+ new_clusters = ocfs2_sum_rightmost_rec(eb_el);
+
+ /* update root_bh now */
+ le16_add_cpu(&root_el->l_tree_depth, 1);
+ root_el->l_recs[0].e_cpos = 0;
+ root_el->l_recs[0].e_blkno = eb->h_blkno;
+ root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+ for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+ memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+ root_el->l_next_free_rec = cpu_to_le16(1);
/* If this is our 1st tree depth shift, then last_eb_blk
* becomes the allocated extent block */
- if (fe_el->l_tree_depth == cpu_to_le16(1))
- fe->i_last_eb_blk = eb->h_blkno;
+ if (root_el->l_tree_depth == cpu_to_le16(1))
+ ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, et->et_root_bh);
*ret_new_eb_bh = new_eb_bh;
new_eb_bh = NULL;
status = 0;
bail:
- if (new_eb_bh)
- brelse(new_eb_bh);
-
- mlog_exit(status);
- return status;
-}
-
-/*
- * Expects the tree to already have room in the rightmost leaf for the
- * extent. Updates all the extent blocks (and the dinode) on the way
- * down.
- */
-static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
- u64 start_blk,
- u32 new_clusters)
-{
- int status, i, num_bhs = 0;
- u64 next_blkno;
- u16 next_free;
- struct buffer_head **eb_bhs = NULL;
- struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *el;
-
- mlog_entry_void();
-
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
- el = &fe->id2.i_list;
- if (el->l_tree_depth) {
- /* This is another operation where we want to be
- * careful about our tree updates. An error here means
- * none of the previous changes we made should roll
- * forward. As a result, we have to record the buffers
- * for this part of the tree in an array and reserve a
- * journal write to them before making any changes. */
- num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
- eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
- GFP_KERNEL);
- if (!eb_bhs) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
-
- i = 0;
- while(el->l_tree_depth) {
- next_free = le16_to_cpu(el->l_next_free_rec);
- if (next_free == 0) {
- ocfs2_error(inode->i_sb,
- "Dinode %llu has a bad extent list",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = -EIO;
- goto bail;
- }
- next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
-
- BUG_ON(i >= num_bhs);
- status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
- OCFS2_BH_CACHED, inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
- eb);
- status = -EIO;
- goto bail;
- }
-
- status = ocfs2_journal_access(handle, inode, eb_bhs[i],
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- el = &eb->h_list;
- i++;
- /* When we leave this loop, eb_bhs[num_bhs - 1] will
- * hold the bottom-most leaf extent block. */
- }
- BUG_ON(el->l_tree_depth);
-
- el = &fe->id2.i_list;
- /* If we have tree depth, then the fe update is
- * trivial, and we want to switch el out for the
- * bottom-most leaf in order to update it with the
- * actual extent data below. */
- next_free = le16_to_cpu(el->l_next_free_rec);
- if (next_free == 0) {
- ocfs2_error(inode->i_sb,
- "Dinode %llu has a bad extent list",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = -EIO;
- goto bail;
- }
- le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
- new_clusters);
- /* (num_bhs - 1) to avoid the leaf */
- for(i = 0; i < (num_bhs - 1); i++) {
- eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
- el = &eb->h_list;
-
- /* finally, make our actual change to the
- * intermediate extent blocks. */
- next_free = le16_to_cpu(el->l_next_free_rec);
- le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
- new_clusters);
-
- status = ocfs2_journal_dirty(handle, eb_bhs[i]);
- if (status < 0)
- mlog_errno(status);
- }
- BUG_ON(i != (num_bhs - 1));
- /* note that the leaf block wasn't touched in
- * the loop above */
- eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
- el = &eb->h_list;
- BUG_ON(el->l_tree_depth);
- }
-
- /* yay, we can finally add the actual extent now! */
- i = le16_to_cpu(el->l_next_free_rec) - 1;
- if (le16_to_cpu(el->l_next_free_rec) &&
- ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
- le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
- } else if (le16_to_cpu(el->l_next_free_rec) &&
- (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
- /* having an empty extent at eof is legal. */
- if (el->l_recs[i].e_cpos != fe->i_clusters) {
- ocfs2_error(inode->i_sb,
- "Dinode %llu trailing extent is bad: "
- "cpos (%u) != number of clusters (%u)",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- le32_to_cpu(el->l_recs[i].e_cpos),
- le32_to_cpu(fe->i_clusters));
- status = -EIO;
- goto bail;
- }
- el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
- el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
- } else {
- /* No contiguous record, or no empty record at eof, so
- * we add a new one. */
-
- BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
- le16_to_cpu(el->l_count));
- i = le16_to_cpu(el->l_next_free_rec);
-
- el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
- el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
- el->l_recs[i].e_cpos = fe->i_clusters;
- le16_add_cpu(&el->l_next_free_rec, 1);
- }
-
- /*
- * extent_map errors are not fatal, so they are ignored outside
- * of flushing the thing.
- */
- status = ocfs2_extent_map_append(inode, &el->l_recs[i],
- new_clusters);
- if (status) {
- mlog_errno(status);
- ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
- }
-
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0)
- mlog_errno(status);
- if (fe->id2.i_list.l_tree_depth) {
- status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
- if (status < 0)
- mlog_errno(status);
- }
-
- status = 0;
-bail:
- if (eb_bhs) {
- for (i = 0; i < num_bhs; i++)
- if (eb_bhs[i])
- brelse(eb_bhs[i]);
- kfree(eb_bhs);
- }
+ brelse(new_eb_bh);
- mlog_exit(status);
return status;
}
@@ -716,77 +1421,64 @@ bail:
* 1) a lowest extent block is found, then we pass it back in
* *lowest_eb_bh and return '0'
*
- * 2) the search fails to find anything, but the dinode has room. We
+ * 2) the search fails to find anything, but the root_el has room. We
* pass NULL back in *lowest_eb_bh, but still return '0'
*
- * 3) the search fails to find anything AND the dinode is full, in
+ * 3) the search fails to find anything AND the root_el is full, in
* which case we return > 0
*
* return status < 0 indicates an error.
*/
-static int ocfs2_find_branch_target(struct ocfs2_super *osb,
- struct inode *inode,
- struct buffer_head *fe_bh,
+static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
struct buffer_head **target_bh)
{
int status = 0, i;
u64 blkno;
- struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct buffer_head *bh = NULL;
struct buffer_head *lowest_bh = NULL;
- mlog_entry_void();
-
*target_bh = NULL;
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
- el = &fe->id2.i_list;
+ el = et->et_root_el;
while(le16_to_cpu(el->l_tree_depth) > 1) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
- ocfs2_error(inode->i_sb, "Dinode %llu has empty "
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has empty "
"extent list (next_free_rec == 0)",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
status = -EIO;
goto bail;
}
i = le16_to_cpu(el->l_next_free_rec) - 1;
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (!blkno) {
- ocfs2_error(inode->i_sb, "Dinode %llu has extent "
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has extent "
"list where extent # %d has no physical "
"block start",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
status = -EIO;
goto bail;
}
- if (bh) {
- brelse(bh);
- bh = NULL;
- }
+ brelse(bh);
+ bh = NULL;
- status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
- inode);
+ status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
eb = (struct ocfs2_extent_block *) bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
- }
el = &eb->h_list;
if (le16_to_cpu(el->l_next_free_rec) <
le16_to_cpu(el->l_count)) {
- if (lowest_bh)
- brelse(lowest_bh);
+ brelse(lowest_bh);
lowest_bh = bh;
get_bh(lowest_bh);
}
@@ -794,130 +1486,4282 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
/* If we didn't find one and the fe doesn't have any room,
* then return '1' */
- if (!lowest_bh
- && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+ el = et->et_root_el;
+ if (!lowest_bh && (el->l_next_free_rec == el->l_count))
status = 1;
*target_bh = lowest_bh;
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
- mlog_exit(status);
return status;
}
-/* the caller needs to update fe->i_clusters */
-int ocfs2_insert_extent(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
- u64 start_blk,
- u32 new_clusters,
- struct ocfs2_alloc_context *meta_ac)
+/*
+ * Grow a b-tree so that it has more records.
+ *
+ * We might shift the tree depth in which case existing paths should
+ * be considered invalid.
+ *
+ * Tree depth after the grow is returned via *final_depth.
+ *
+ * *last_eb_bh will be updated by ocfs2_add_branch().
+ */
+static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
+ int *final_depth, struct buffer_head **last_eb_bh,
+ struct ocfs2_alloc_context *meta_ac)
{
- int status, i, shift;
- struct buffer_head *last_eb_bh = NULL;
+ int ret, shift;
+ struct ocfs2_extent_list *el = et->et_root_el;
+ int depth = le16_to_cpu(el->l_tree_depth);
+ struct buffer_head *bh = NULL;
+
+ BUG_ON(meta_ac == NULL);
+
+ shift = ocfs2_find_branch_target(et, &bh);
+ if (shift < 0) {
+ ret = shift;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* We traveled all the way to the bottom of the allocation tree
+ * and didn't find room for any more extents - we need to add
+ * another tree level */
+ if (shift) {
+ BUG_ON(bh);
+ trace_ocfs2_grow_tree(
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci),
+ depth);
+
+ /* ocfs2_shift_tree_depth will return us a buffer with
+ * the new extent block (so we can pass that to
+ * ocfs2_add_branch). */
+ ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ depth++;
+ if (depth == 1) {
+ /*
+ * Special case: we have room now if we shifted from
+ * tree_depth 0, so no more work needs to be done.
+ *
+ * We won't be calling add_branch, so pass
+ * back *last_eb_bh as the new leaf. At depth
+ * zero, it should always be null so there's
+ * no reason to brelse.
+ */
+ BUG_ON(*last_eb_bh);
+ get_bh(bh);
+ *last_eb_bh = bh;
+ goto out;
+ }
+ }
+
+ /* call ocfs2_add_branch to add the final part of the tree with
+ * the new data. */
+ ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
+ meta_ac);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (final_depth)
+ *final_depth = depth;
+ brelse(bh);
+ return ret;
+}
+
+/*
+ * This function will discard the rightmost extent record.
+ */
+static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
+{
+ int next_free = le16_to_cpu(el->l_next_free_rec);
+ int count = le16_to_cpu(el->l_count);
+ unsigned int num_bytes;
+
+ BUG_ON(!next_free);
+ /* This will cause us to go off the end of our extent list. */
+ BUG_ON(next_free >= count);
+
+ num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
+
+ memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
+}
+
+static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ int i, insert_index, next_free, has_empty, num_bytes;
+ u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
+ struct ocfs2_extent_rec *rec;
+
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
+
+ BUG_ON(!next_free);
+
+ /* The tree code before us didn't allow enough room in the leaf. */
+ BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
+
+ /*
+ * The easiest way to approach this is to just remove the
+ * empty extent and temporarily decrement next_free.
+ */
+ if (has_empty) {
+ /*
+ * If next_free was 1 (only an empty extent), this
+ * loop won't execute, which is fine. We still want
+ * the decrement above to happen.
+ */
+ for(i = 0; i < (next_free - 1); i++)
+ el->l_recs[i] = el->l_recs[i+1];
+
+ next_free--;
+ }
+
+ /*
+ * Figure out what the new record index should be.
+ */
+ for(i = 0; i < next_free; i++) {
+ rec = &el->l_recs[i];
+
+ if (insert_cpos < le32_to_cpu(rec->e_cpos))
+ break;
+ }
+ insert_index = i;
+
+ trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
+ has_empty, next_free,
+ le16_to_cpu(el->l_count));
+
+ BUG_ON(insert_index < 0);
+ BUG_ON(insert_index >= le16_to_cpu(el->l_count));
+ BUG_ON(insert_index > next_free);
+
+ /*
+ * No need to memmove if we're just adding to the tail.
+ */
+ if (insert_index != next_free) {
+ BUG_ON(next_free >= le16_to_cpu(el->l_count));
+
+ num_bytes = next_free - insert_index;
+ num_bytes *= sizeof(struct ocfs2_extent_rec);
+ memmove(&el->l_recs[insert_index + 1],
+ &el->l_recs[insert_index],
+ num_bytes);
+ }
+
+ /*
+ * Either we had an empty extent, and need to re-increment or
+ * there was no empty extent on a non full rightmost leaf node,
+ * in which case we still need to increment.
+ */
+ next_free++;
+ el->l_next_free_rec = cpu_to_le16(next_free);
+ /*
+ * Make sure none of the math above just messed up our tree.
+ */
+ BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
+
+ el->l_recs[insert_index] = *insert_rec;
+
+}
+
+static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
+{
+ int size, num_recs = le16_to_cpu(el->l_next_free_rec);
+
+ BUG_ON(num_recs == 0);
+
+ if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+ num_recs--;
+ size = num_recs * sizeof(struct ocfs2_extent_rec);
+ memmove(&el->l_recs[0], &el->l_recs[1], size);
+ memset(&el->l_recs[num_recs], 0,
+ sizeof(struct ocfs2_extent_rec));
+ el->l_next_free_rec = cpu_to_le16(num_recs);
+ }
+}
+
+/*
+ * Create an empty extent record .
+ *
+ * l_next_free_rec may be updated.
+ *
+ * If an empty extent already exists do nothing.
+ */
+static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
+{
+ int next_free = le16_to_cpu(el->l_next_free_rec);
+
+ BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+ if (next_free == 0)
+ goto set_and_inc;
+
+ if (ocfs2_is_empty_extent(&el->l_recs[0]))
+ return;
+
+ mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
+ "Asked to create an empty extent in a full list:\n"
+ "count = %u, tree depth = %u",
+ le16_to_cpu(el->l_count),
+ le16_to_cpu(el->l_tree_depth));
+
+ ocfs2_shift_records_right(el);
+
+set_and_inc:
+ le16_add_cpu(&el->l_next_free_rec, 1);
+ memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+}
+
+/*
+ * For a rotation which involves two leaf nodes, the "root node" is
+ * the lowest level tree node which contains a path to both leafs. This
+ * resulting set of information can be used to form a complete "subtree"
+ *
+ * This function is passed two full paths from the dinode down to a
+ * pair of adjacent leaves. It's task is to figure out which path
+ * index contains the subtree root - this can be the root index itself
+ * in a worst-case rotation.
+ *
+ * The array index of the subtree root is passed back.
+ */
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left,
+ struct ocfs2_path *right)
+{
+ int i = 0;
+
+ /*
+ * Check that the caller passed in two paths from the same tree.
+ */
+ BUG_ON(path_root_bh(left) != path_root_bh(right));
+
+ do {
+ i++;
+
+ /*
+ * The caller didn't pass two adjacent paths.
+ */
+ mlog_bug_on_msg(i > left->p_tree_depth,
+ "Owner %llu, left depth %u, right depth %u\n"
+ "left leaf blk %llu, right leaf blk %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ left->p_tree_depth, right->p_tree_depth,
+ (unsigned long long)path_leaf_bh(left)->b_blocknr,
+ (unsigned long long)path_leaf_bh(right)->b_blocknr);
+ } while (left->p_node[i].bh->b_blocknr ==
+ right->p_node[i].bh->b_blocknr);
+
+ return i - 1;
+}
+
+typedef void (path_insert_t)(void *, struct buffer_head *);
+
+/*
+ * Traverse a btree path in search of cpos, starting at root_el.
+ *
+ * This code can be called with a cpos larger than the tree, in which
+ * case it will return the rightmost path.
+ */
+static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
+ struct ocfs2_extent_list *root_el, u32 cpos,
+ path_insert_t *func, void *data)
+{
+ int i, ret = 0;
+ u32 range;
+ u64 blkno;
struct buffer_head *bh = NULL;
- struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *el;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
- mlog_entry_void();
+ el = root_el;
+ while (el->l_tree_depth) {
+ if (le16_to_cpu(el->l_next_free_rec) == 0) {
+ ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+ "Owner %llu has empty extent list at "
+ "depth %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ le16_to_cpu(el->l_tree_depth));
+ ret = -EROFS;
+ goto out;
- mlog(0, "add %u clusters starting at block %llu to inode %llu\n",
- new_clusters, (unsigned long long)start_blk,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ }
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
- el = &fe->id2.i_list;
+ for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
+ rec = &el->l_recs[i];
+
+ /*
+ * In the case that cpos is off the allocation
+ * tree, this should just wind up returning the
+ * rightmost record.
+ */
+ range = le32_to_cpu(rec->e_cpos) +
+ ocfs2_rec_clusters(el, rec);
+ if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
+ break;
+ }
- if (el->l_tree_depth) {
- /* jump to end of tree */
- status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
- &last_eb_bh, OCFS2_BH_CACHED, inode);
- if (status < 0) {
- mlog_exit(status);
- goto bail;
+ blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+ if (blkno == 0) {
+ ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+ "Owner %llu has bad blkno in extent list "
+ "at depth %u (index %d)\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ le16_to_cpu(el->l_tree_depth), i);
+ ret = -EROFS;
+ goto out;
}
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+
+ brelse(bh);
+ bh = NULL;
+ ret = ocfs2_read_extent_block(ci, blkno, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) bh->b_data;
+ el = &eb->h_list;
+
+ if (le16_to_cpu(el->l_next_free_rec) >
+ le16_to_cpu(el->l_count)) {
+ ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+ "Owner %llu has bad count in extent list "
+ "at block %llu (next free=%u, count=%u)\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(el->l_next_free_rec),
+ le16_to_cpu(el->l_count));
+ ret = -EROFS;
+ goto out;
+ }
+
+ if (func)
+ func(data, bh);
+ }
+
+out:
+ /*
+ * Catch any trailing bh that the loop didn't handle.
+ */
+ brelse(bh);
+
+ return ret;
+}
+
+/*
+ * Given an initialized path (that is, it has a valid root extent
+ * list), this function will traverse the btree in search of the path
+ * which would contain cpos.
+ *
+ * The path traveled is recorded in the path structure.
+ *
+ * Note that this will not do any comparisons on leaf node extent
+ * records, so it will work fine in the case that we just added a tree
+ * branch.
+ */
+struct find_path_data {
+ int index;
+ struct ocfs2_path *path;
+};
+static void find_path_ins(void *data, struct buffer_head *bh)
+{
+ struct find_path_data *fp = data;
+
+ get_bh(bh);
+ ocfs2_path_insert_eb(fp->path, fp->index, bh);
+ fp->index++;
+}
+int ocfs2_find_path(struct ocfs2_caching_info *ci,
+ struct ocfs2_path *path, u32 cpos)
+{
+ struct find_path_data data;
+
+ data.index = 1;
+ data.path = path;
+ return __ocfs2_find_path(ci, path_root_el(path), cpos,
+ find_path_ins, &data);
+}
+
+static void find_leaf_ins(void *data, struct buffer_head *bh)
+{
+ struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
+ struct ocfs2_extent_list *el = &eb->h_list;
+ struct buffer_head **ret = data;
+
+ /* We want to retain only the leaf block. */
+ if (le16_to_cpu(el->l_tree_depth) == 0) {
+ get_bh(bh);
+ *ret = bh;
+ }
+}
+/*
+ * Find the leaf block in the tree which would contain cpos. No
+ * checking of the actual leaf is done.
+ *
+ * Some paths want to call this instead of allocating a path structure
+ * and calling ocfs2_find_path().
+ *
+ * This function doesn't handle non btree extent lists.
+ */
+int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
+ struct ocfs2_extent_list *root_el, u32 cpos,
+ struct buffer_head **leaf_bh)
+{
+ int ret;
+ struct buffer_head *bh = NULL;
+
+ ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ *leaf_bh = bh;
+out:
+ return ret;
+}
+
+/*
+ * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
+ *
+ * Basically, we've moved stuff around at the bottom of the tree and
+ * we need to fix up the extent records above the changes to reflect
+ * the new changes.
+ *
+ * left_rec: the record on the left.
+ * left_child_el: is the child list pointed to by left_rec
+ * right_rec: the record to the right of left_rec
+ * right_child_el: is the child list pointed to by right_rec
+ *
+ * By definition, this only works on interior nodes.
+ */
+static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
+ struct ocfs2_extent_list *left_child_el,
+ struct ocfs2_extent_rec *right_rec,
+ struct ocfs2_extent_list *right_child_el)
+{
+ u32 left_clusters, right_end;
+
+ /*
+ * Interior nodes never have holes. Their cpos is the cpos of
+ * the leftmost record in their child list. Their cluster
+ * count covers the full theoretical range of their child list
+ * - the range between their cpos and the cpos of the record
+ * immediately to their right.
+ */
+ left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
+ if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
+ BUG_ON(right_child_el->l_tree_depth);
+ BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
+ left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
+ }
+ left_clusters -= le32_to_cpu(left_rec->e_cpos);
+ left_rec->e_int_clusters = cpu_to_le32(left_clusters);
+
+ /*
+ * Calculate the rightmost cluster count boundary before
+ * moving cpos - we will need to adjust clusters after
+ * updating e_cpos to keep the same highest cluster count.
+ */
+ right_end = le32_to_cpu(right_rec->e_cpos);
+ right_end += le32_to_cpu(right_rec->e_int_clusters);
+
+ right_rec->e_cpos = left_rec->e_cpos;
+ le32_add_cpu(&right_rec->e_cpos, left_clusters);
+
+ right_end -= le32_to_cpu(right_rec->e_cpos);
+ right_rec->e_int_clusters = cpu_to_le32(right_end);
+}
+
+/*
+ * Adjust the adjacent root node records involved in a
+ * rotation. left_el_blkno is passed in as a key so that we can easily
+ * find it's index in the root list.
+ */
+static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
+ struct ocfs2_extent_list *left_el,
+ struct ocfs2_extent_list *right_el,
+ u64 left_el_blkno)
+{
+ int i;
+
+ BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
+ le16_to_cpu(left_el->l_tree_depth));
+
+ for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
+ if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
+ break;
+ }
+
+ /*
+ * The path walking code should have never returned a root and
+ * two paths which are not adjacent.
+ */
+ BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
+
+ ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
+ &root_el->l_recs[i + 1], right_el);
+}
+
+/*
+ * We've changed a leaf block (in right_path) and need to reflect that
+ * change back up the subtree.
+ *
+ * This happens in multiple places:
+ * - When we've moved an extent record from the left path leaf to the right
+ * path leaf to make room for an empty extent in the left path leaf.
+ * - When our insert into the right path leaf is at the leftmost edge
+ * and requires an update of the path immediately to it's left. This
+ * can occur at the end of some types of rotation and appending inserts.
+ * - When we've adjusted the last extent record in the left path leaf and the
+ * 1st extent record in the right path leaf during cross extent block merge.
+ */
+static void ocfs2_complete_edge_insert(handle_t *handle,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ int subtree_index)
+{
+ int i, idx;
+ struct ocfs2_extent_list *el, *left_el, *right_el;
+ struct ocfs2_extent_rec *left_rec, *right_rec;
+ struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
+
+ /*
+ * Update the counts and position values within all the
+ * interior nodes to reflect the leaf rotation we just did.
+ *
+ * The root node is handled below the loop.
+ *
+ * We begin the loop with right_el and left_el pointing to the
+ * leaf lists and work our way up.
+ *
+ * NOTE: within this loop, left_el and right_el always refer
+ * to the *child* lists.
+ */
+ left_el = path_leaf_el(left_path);
+ right_el = path_leaf_el(right_path);
+ for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
+ trace_ocfs2_complete_edge_insert(i);
+
+ /*
+ * One nice property of knowing that all of these
+ * nodes are below the root is that we only deal with
+ * the leftmost right node record and the rightmost
+ * left node record.
+ */
+ el = left_path->p_node[i].el;
+ idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
+ left_rec = &el->l_recs[idx];
+
+ el = right_path->p_node[i].el;
+ right_rec = &el->l_recs[0];
+
+ ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
+ right_el);
+
+ ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
+ ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
+
+ /*
+ * Setup our list pointers now so that the current
+ * parents become children in the next iteration.
+ */
+ left_el = left_path->p_node[i].el;
+ right_el = right_path->p_node[i].el;
+ }
+
+ /*
+ * At the root node, adjust the two adjacent records which
+ * begin our path to the leaves.
+ */
+
+ el = left_path->p_node[subtree_index].el;
+ left_el = left_path->p_node[subtree_index + 1].el;
+ right_el = right_path->p_node[subtree_index + 1].el;
+
+ ocfs2_adjust_root_records(el, left_el, right_el,
+ left_path->p_node[subtree_index + 1].bh->b_blocknr);
+
+ root_bh = left_path->p_node[subtree_index].bh;
+
+ ocfs2_journal_dirty(handle, root_bh);
+}
+
+static int ocfs2_rotate_subtree_right(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ int subtree_index)
+{
+ int ret, i;
+ struct buffer_head *right_leaf_bh;
+ struct buffer_head *left_leaf_bh = NULL;
+ struct buffer_head *root_bh;
+ struct ocfs2_extent_list *right_el, *left_el;
+ struct ocfs2_extent_rec move_rec;
+
+ left_leaf_bh = path_leaf_bh(left_path);
+ left_el = path_leaf_el(left_path);
+
+ if (left_el->l_next_free_rec != left_el->l_count) {
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Inode %llu has non-full interior leaf node %llu"
+ "(next free = %u)",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ (unsigned long long)left_leaf_bh->b_blocknr,
+ le16_to_cpu(left_el->l_next_free_rec));
+ return -EROFS;
+ }
+
+ /*
+ * This extent block may already have an empty record, so we
+ * return early if so.
+ */
+ if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
+ return 0;
+
+ root_bh = left_path->p_node[subtree_index].bh;
+ BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
+ subtree_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+ right_path, i);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+ left_path, i);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ right_leaf_bh = path_leaf_bh(right_path);
+ right_el = path_leaf_el(right_path);
+
+ /* This is a code error, not a disk corruption. */
+ mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
+ "because rightmost leaf block %llu is empty\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ (unsigned long long)right_leaf_bh->b_blocknr);
+
+ ocfs2_create_empty_extent(right_el);
+
+ ocfs2_journal_dirty(handle, right_leaf_bh);
+
+ /* Do the copy now. */
+ i = le16_to_cpu(left_el->l_next_free_rec) - 1;
+ move_rec = left_el->l_recs[i];
+ right_el->l_recs[0] = move_rec;
+
+ /*
+ * Clear out the record we just copied and shift everything
+ * over, leaving an empty extent in the left leaf.
+ *
+ * We temporarily subtract from next_free_rec so that the
+ * shift will lose the tail record (which is now defunct).
+ */
+ le16_add_cpu(&left_el->l_next_free_rec, -1);
+ ocfs2_shift_records_right(left_el);
+ memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+ le16_add_cpu(&left_el->l_next_free_rec, 1);
+
+ ocfs2_journal_dirty(handle, left_leaf_bh);
+
+ ocfs2_complete_edge_insert(handle, left_path, right_path,
+ subtree_index);
+
+out:
+ return ret;
+}
+
+/*
+ * Given a full path, determine what cpos value would return us a path
+ * containing the leaf immediately to the left of the current one.
+ *
+ * Will return zero if the path passed in is already the leftmost path.
+ */
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos)
+{
+ int i, j, ret = 0;
+ u64 blkno;
+ struct ocfs2_extent_list *el;
+
+ BUG_ON(path->p_tree_depth == 0);
+
+ *cpos = 0;
+
+ blkno = path_leaf_bh(path)->b_blocknr;
+
+ /* Start at the tree node just above the leaf and work our way up. */
+ i = path->p_tree_depth - 1;
+ while (i >= 0) {
+ el = path->p_node[i].el;
+
+ /*
+ * Find the extent record just before the one in our
+ * path.
+ */
+ for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
+ if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
+ if (j == 0) {
+ if (i == 0) {
+ /*
+ * We've determined that the
+ * path specified is already
+ * the leftmost one - return a
+ * cpos of zero.
+ */
+ goto out;
+ }
+ /*
+ * The leftmost record points to our
+ * leaf - we need to travel up the
+ * tree one level.
+ */
+ goto next_node;
+ }
+
+ *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
+ *cpos = *cpos + ocfs2_rec_clusters(el,
+ &el->l_recs[j - 1]);
+ *cpos = *cpos - 1;
+ goto out;
+ }
+ }
+
+ /*
+ * If we got here, we never found a valid node where
+ * the tree indicated one should be.
+ */
+ ocfs2_error(sb,
+ "Invalid extent tree at extent block %llu\n",
+ (unsigned long long)blkno);
+ ret = -EROFS;
+ goto out;
+
+next_node:
+ blkno = path->p_node[i].bh->b_blocknr;
+ i--;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * Extend the transaction by enough credits to complete the rotation,
+ * and still leave at least the original number of credits allocated
+ * to this transaction.
+ */
+static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
+ int op_credits,
+ struct ocfs2_path *path)
+{
+ int ret = 0;
+ int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
+
+ if (handle->h_buffer_credits < credits)
+ ret = ocfs2_extend_trans(handle,
+ credits - handle->h_buffer_credits);
+
+ return ret;
+}
+
+/*
+ * Trap the case where we're inserting into the theoretical range past
+ * the _actual_ left leaf range. Otherwise, we'll rotate a record
+ * whose cpos is less than ours into the right leaf.
+ *
+ * It's only necessary to look at the rightmost record of the left
+ * leaf because the logic that calls us should ensure that the
+ * theoretical ranges in the path components above the leaves are
+ * correct.
+ */
+static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
+ u32 insert_cpos)
+{
+ struct ocfs2_extent_list *left_el;
+ struct ocfs2_extent_rec *rec;
+ int next_free;
+
+ left_el = path_leaf_el(left_path);
+ next_free = le16_to_cpu(left_el->l_next_free_rec);
+ rec = &left_el->l_recs[next_free - 1];
+
+ if (insert_cpos > le32_to_cpu(rec->e_cpos))
+ return 1;
+ return 0;
+}
+
+static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
+{
+ int next_free = le16_to_cpu(el->l_next_free_rec);
+ unsigned int range;
+ struct ocfs2_extent_rec *rec;
+
+ if (next_free == 0)
+ return 0;
+
+ rec = &el->l_recs[0];
+ if (ocfs2_is_empty_extent(rec)) {
+ /* Empty list. */
+ if (next_free == 1)
+ return 0;
+ rec = &el->l_recs[1];
+ }
+
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+ if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
+ return 1;
+ return 0;
+}
+
+/*
+ * Rotate all the records in a btree right one record, starting at insert_cpos.
+ *
+ * The path to the rightmost leaf should be passed in.
+ *
+ * The array is assumed to be large enough to hold an entire path (tree depth).
+ *
+ * Upon successful return from this function:
+ *
+ * - The 'right_path' array will contain a path to the leaf block
+ * whose range contains e_cpos.
+ * - That leaf block will have a single empty extent in list index 0.
+ * - In the case that the rotation requires a post-insert update,
+ * *ret_left_path will contain a valid path which can be passed to
+ * ocfs2_insert_path().
+ */
+static int ocfs2_rotate_tree_right(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ enum ocfs2_split_type split,
+ u32 insert_cpos,
+ struct ocfs2_path *right_path,
+ struct ocfs2_path **ret_left_path)
+{
+ int ret, start, orig_credits = handle->h_buffer_credits;
+ u32 cpos;
+ struct ocfs2_path *left_path = NULL;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+
+ *ret_left_path = NULL;
+
+ left_path = ocfs2_new_path_from_path(right_path);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_rotate_tree_right(
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ insert_cpos, cpos);
+
+ /*
+ * What we want to do here is:
+ *
+ * 1) Start with the rightmost path.
+ *
+ * 2) Determine a path to the leaf block directly to the left
+ * of that leaf.
+ *
+ * 3) Determine the 'subtree root' - the lowest level tree node
+ * which contains a path to both leaves.
+ *
+ * 4) Rotate the subtree.
+ *
+ * 5) Find the next subtree by considering the left path to be
+ * the new right path.
+ *
+ * The check at the top of this while loop also accepts
+ * insert_cpos == cpos because cpos is only a _theoretical_
+ * value to get us the left path - insert_cpos might very well
+ * be filling that hole.
+ *
+ * Stop at a cpos of '0' because we either started at the
+ * leftmost branch (i.e., a tree with one branch and a
+ * rotation inside of it), or we've gone as far as we can in
+ * rotating subtrees.
+ */
+ while (cpos && insert_cpos <= cpos) {
+ trace_ocfs2_rotate_tree_right(
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci),
+ insert_cpos, cpos);
+
+ ret = ocfs2_find_path(et->et_ci, left_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog_bug_on_msg(path_leaf_bh(left_path) ==
+ path_leaf_bh(right_path),
+ "Owner %llu: error during insert of %u "
+ "(left path cpos %u) results in two identical "
+ "paths ending at %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ insert_cpos, cpos,
+ (unsigned long long)
+ path_leaf_bh(left_path)->b_blocknr);
+
+ if (split == SPLIT_NONE &&
+ ocfs2_rotate_requires_path_adjustment(left_path,
+ insert_cpos)) {
+
+ /*
+ * We've rotated the tree as much as we
+ * should. The rest is up to
+ * ocfs2_insert_path() to complete, after the
+ * record insertion. We indicate this
+ * situation by returning the left path.
+ *
+ * The reason we don't adjust the records here
+ * before the record insert is that an error
+ * later might break the rule where a parent
+ * record e_cpos will reflect the actual
+ * e_cpos of the 1st nonempty record of the
+ * child list.
+ */
+ *ret_left_path = left_path;
+ goto out_ret_path;
+ }
+
+ start = ocfs2_find_subtree_root(et, left_path, right_path);
+
+ trace_ocfs2_rotate_subtree(start,
+ (unsigned long long)
+ right_path->p_node[start].bh->b_blocknr,
+ right_path->p_tree_depth);
+
+ ret = ocfs2_extend_rotate_transaction(handle, start,
+ orig_credits, right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_rotate_subtree_right(handle, et, left_path,
+ right_path, start);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (split != SPLIT_NONE &&
+ ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
+ insert_cpos)) {
+ /*
+ * A rotate moves the rightmost left leaf
+ * record over to the leftmost right leaf
+ * slot. If we're doing an extent split
+ * instead of a real insert, then we have to
+ * check that the extent to be split wasn't
+ * just moved over. If it was, then we can
+ * exit here, passing left_path back -
+ * ocfs2_split_extent() is smart enough to
+ * search both leaves.
+ */
+ *ret_left_path = left_path;
+ goto out_ret_path;
+ }
+
+ /*
+ * There is no need to re-read the next right path
+ * as we know that it'll be our current left
+ * path. Optimize by copying values instead.
+ */
+ ocfs2_mv_path(right_path, left_path);
+
+ ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ ocfs2_free_path(left_path);
+
+out_ret_path:
+ return ret;
+}
+
+static int ocfs2_update_edge_lengths(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ int subtree_index, struct ocfs2_path *path)
+{
+ int i, idx, ret;
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_block *eb;
+ u32 range;
+
+ /*
+ * In normal tree rotation process, we will never touch the
+ * tree branch above subtree_index and ocfs2_extend_rotate_transaction
+ * doesn't reserve the credits for them either.
+ *
+ * But we do have a special case here which will update the rightmost
+ * records for all the bh in the path.
+ * So we have to allocate extra credits and access them.
+ */
+ ret = ocfs2_extend_trans(handle, subtree_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(et->et_ci, handle, path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Path should always be rightmost. */
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+ BUG_ON(eb->h_next_leaf_blk != 0ULL);
+
+ el = &eb->h_list;
+ BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+ idx = le16_to_cpu(el->l_next_free_rec) - 1;
+ rec = &el->l_recs[idx];
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+ for (i = 0; i < path->p_tree_depth; i++) {
+ el = path->p_node[i].el;
+ idx = le16_to_cpu(el->l_next_free_rec) - 1;
+ rec = &el->l_recs[idx];
+
+ rec->e_int_clusters = cpu_to_le32(range);
+ le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
+
+ ocfs2_journal_dirty(handle, path->p_node[i].bh);
+ }
+out:
+ return ret;
+}
+
+static void ocfs2_unlink_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_path *path, int unlink_start)
+{
+ int ret, i;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+ struct buffer_head *bh;
+
+ for(i = unlink_start; i < path_num_items(path); i++) {
+ bh = path->p_node[i].bh;
+
+ eb = (struct ocfs2_extent_block *)bh->b_data;
+ /*
+ * Not all nodes might have had their final count
+ * decremented by the caller - handle this here.
+ */
el = &eb->h_list;
+ if (le16_to_cpu(el->l_next_free_rec) > 1) {
+ mlog(ML_ERROR,
+ "Inode %llu, attempted to remove extent block "
+ "%llu with %u records\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(el->l_next_free_rec));
+
+ ocfs2_journal_dirty(handle, bh);
+ ocfs2_remove_from_cache(et->et_ci, bh);
+ continue;
+ }
+
+ el->l_next_free_rec = 0;
+ memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+
+ ocfs2_journal_dirty(handle, bh);
+
+ ret = ocfs2_cache_extent_block_free(dealloc, eb);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_remove_from_cache(et->et_ci, bh);
+ }
+}
+
+static void ocfs2_unlink_subtree(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ int subtree_index,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int i;
+ struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
+ struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_block *eb;
+
+ el = path_leaf_el(left_path);
+
+ eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
+
+ for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+ if (root_el->l_recs[i].e_blkno == eb->h_blkno)
+ break;
+
+ BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
+
+ memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+ le16_add_cpu(&root_el->l_next_free_rec, -1);
+
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+ eb->h_next_leaf_blk = 0;
+
+ ocfs2_journal_dirty(handle, root_bh);
+ ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+
+ ocfs2_unlink_path(handle, et, dealloc, right_path,
+ subtree_index + 1);
+}
+
+static int ocfs2_rotate_subtree_left(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ int subtree_index,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int *deleted)
+{
+ int ret, i, del_right_subtree = 0, right_has_empty = 0;
+ struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
+ struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
+ struct ocfs2_extent_block *eb;
+
+ *deleted = 0;
+
+ right_leaf_el = path_leaf_el(right_path);
+ left_leaf_el = path_leaf_el(left_path);
+ root_bh = left_path->p_node[subtree_index].bh;
+ BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+ if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
+ return 0;
+
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
+ if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
+ /*
+ * It's legal for us to proceed if the right leaf is
+ * the rightmost one and it has an empty extent. There
+ * are two cases to handle - whether the leaf will be
+ * empty after removal or not. If the leaf isn't empty
+ * then just remove the empty extent up front. The
+ * next block will handle empty leaves by flagging
+ * them for unlink.
+ *
+ * Non rightmost leaves will throw -EAGAIN and the
+ * caller can manually move the subtree and retry.
+ */
+
+ if (eb->h_next_leaf_blk != 0ULL)
+ return -EAGAIN;
+
+ if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
+ ret = ocfs2_journal_access_eb(handle, et->et_ci,
+ path_leaf_bh(right_path),
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_remove_empty_extent(right_leaf_el);
+ } else
+ right_has_empty = 1;
+ }
+
+ if (eb->h_next_leaf_blk == 0ULL &&
+ le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
+ /*
+ * We have to update i_last_eb_blk during the meta
+ * data delete.
+ */
+ ret = ocfs2_et_root_journal_access(handle, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ del_right_subtree = 1;
+ }
+
+ /*
+ * Getting here with an empty extent in the right path implies
+ * that it's the rightmost path and will be deleted.
+ */
+ BUG_ON(right_has_empty && !del_right_subtree);
+
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
+ subtree_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+ right_path, i);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+ left_path, i);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (!right_has_empty) {
+ /*
+ * Only do this if we're moving a real
+ * record. Otherwise, the action is delayed until
+ * after removal of the right path in which case we
+ * can do a simple shift to remove the empty extent.
+ */
+ ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
+ memset(&right_leaf_el->l_recs[0], 0,
+ sizeof(struct ocfs2_extent_rec));
+ }
+ if (eb->h_next_leaf_blk == 0ULL) {
+ /*
+ * Move recs over to get rid of empty extent, decrease
+ * next_free. This is allowed to remove the last
+ * extent in our leaf (setting l_next_free_rec to
+ * zero) - the delete code below won't care.
+ */
+ ocfs2_remove_empty_extent(right_leaf_el);
+ }
+
+ ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+ ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
+
+ if (del_right_subtree) {
+ ocfs2_unlink_subtree(handle, et, left_path, right_path,
+ subtree_index, dealloc);
+ ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
+ left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+ ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
+
+ /*
+ * Removal of the extent in the left leaf was skipped
+ * above so we could delete the right path
+ * 1st.
+ */
+ if (right_has_empty)
+ ocfs2_remove_empty_extent(left_leaf_el);
+
+ ocfs2_journal_dirty(handle, et_root_bh);
+
+ *deleted = 1;
+ } else
+ ocfs2_complete_edge_insert(handle, left_path, right_path,
+ subtree_index);
+
+out:
+ return ret;
+}
+
+/*
+ * Given a full path, determine what cpos value would return us a path
+ * containing the leaf immediately to the right of the current one.
+ *
+ * Will return zero if the path passed in is already the rightmost path.
+ *
+ * This looks similar, but is subtly different to
+ * ocfs2_find_cpos_for_left_leaf().
+ */
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos)
+{
+ int i, j, ret = 0;
+ u64 blkno;
+ struct ocfs2_extent_list *el;
+
+ *cpos = 0;
+
+ if (path->p_tree_depth == 0)
+ return 0;
+
+ blkno = path_leaf_bh(path)->b_blocknr;
+
+ /* Start at the tree node just above the leaf and work our way up. */
+ i = path->p_tree_depth - 1;
+ while (i >= 0) {
+ int next_free;
+
+ el = path->p_node[i].el;
+
+ /*
+ * Find the extent record just after the one in our
+ * path.
+ */
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
+ if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
+ if (j == (next_free - 1)) {
+ if (i == 0) {
+ /*
+ * We've determined that the
+ * path specified is already
+ * the rightmost one - return a
+ * cpos of zero.
+ */
+ goto out;
+ }
+ /*
+ * The rightmost record points to our
+ * leaf - we need to travel up the
+ * tree one level.
+ */
+ goto next_node;
+ }
+
+ *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
+ goto out;
+ }
+ }
+
+ /*
+ * If we got here, we never found a valid node where
+ * the tree indicated one should be.
+ */
+ ocfs2_error(sb,
+ "Invalid extent tree at extent block %llu\n",
+ (unsigned long long)blkno);
+ ret = -EROFS;
+ goto out;
+
+next_node:
+ blkno = path->p_node[i].bh->b_blocknr;
+ i--;
+ }
+
+out:
+ return ret;
+}
+
+static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path)
+{
+ int ret;
+ struct buffer_head *bh = path_leaf_bh(path);
+ struct ocfs2_extent_list *el = path_leaf_el(path);
+
+ if (!ocfs2_is_empty_extent(&el->l_recs[0]))
+ return 0;
+
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
+ path_num_items(path) - 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_remove_empty_extent(el);
+ ocfs2_journal_dirty(handle, bh);
+
+out:
+ return ret;
+}
+
+static int __ocfs2_rotate_tree_left(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ int orig_credits,
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_path **empty_extent_path)
+{
+ int ret, subtree_root, deleted;
+ u32 right_cpos;
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_path *right_path = NULL;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+
+ BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
+
+ *empty_extent_path = NULL;
+
+ ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ left_path = ocfs2_new_path_from_path(path);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_cp_path(left_path, path);
+
+ right_path = ocfs2_new_path_from_path(path);
+ if (!right_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ while (right_cpos) {
+ ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ subtree_root = ocfs2_find_subtree_root(et, left_path,
+ right_path);
+
+ trace_ocfs2_rotate_subtree(subtree_root,
+ (unsigned long long)
+ right_path->p_node[subtree_root].bh->b_blocknr,
+ right_path->p_tree_depth);
+
+ ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+ orig_credits, left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Caller might still want to make changes to the
+ * tree root, so re-add it to the journal here.
+ */
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+ left_path, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_rotate_subtree_left(handle, et, left_path,
+ right_path, subtree_root,
+ dealloc, &deleted);
+ if (ret == -EAGAIN) {
+ /*
+ * The rotation has to temporarily stop due to
+ * the right subtree having an empty
+ * extent. Pass it back to the caller for a
+ * fixup.
+ */
+ *empty_extent_path = right_path;
+ right_path = NULL;
+ goto out;
+ }
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * The subtree rotate might have removed records on
+ * the rightmost edge. If so, then rotation is
+ * complete.
+ */
+ if (deleted)
+ break;
+
+ ocfs2_mv_path(left_path, right_path);
+
+ ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
+ &right_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ ocfs2_free_path(right_path);
+ ocfs2_free_path(left_path);
+
+ return ret;
+}
+
+static int ocfs2_remove_rightmost_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret, subtree_index;
+ u32 cpos;
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+
+
+ ret = ocfs2_et_sanity_check(et);
+ if (ret)
+ goto out;
+ /*
+ * There's two ways we handle this depending on
+ * whether path is the only existing one.
+ */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(et->et_ci, handle, path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+ path, &cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (cpos) {
+ /*
+ * We have a path to the left of this one - it needs
+ * an update too.
+ */
+ left_path = ocfs2_new_path_from_path(path);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(et->et_ci, left_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ subtree_index = ocfs2_find_subtree_root(et, left_path, path);
+
+ ocfs2_unlink_subtree(handle, et, left_path, path,
+ subtree_index, dealloc);
+ ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
+ left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+ ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
+ } else {
+ /*
+ * 'path' is also the leftmost path which
+ * means it must be the only one. This gets
+ * handled differently because we want to
+ * revert the root back to having extents
+ * in-line.
+ */
+ ocfs2_unlink_path(handle, et, dealloc, path, 1);
+
+ el = et->et_root_el;
+ el->l_tree_depth = 0;
+ el->l_next_free_rec = 0;
+ memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+
+ ocfs2_et_set_last_eb_blk(et, 0);
+ }
+
+ ocfs2_journal_dirty(handle, path_root_bh(path));
+
+out:
+ ocfs2_free_path(left_path);
+ return ret;
+}
+
+/*
+ * Left rotation of btree records.
+ *
+ * In many ways, this is (unsurprisingly) the opposite of right
+ * rotation. We start at some non-rightmost path containing an empty
+ * extent in the leaf block. The code works its way to the rightmost
+ * path by rotating records to the left in every subtree.
+ *
+ * This is used by any code which reduces the number of extent records
+ * in a leaf. After removal, an empty record should be placed in the
+ * leftmost list position.
+ *
+ * This won't handle a length update of the rightmost path records if
+ * the rightmost tree leaf record is removed so the caller is
+ * responsible for detecting and correcting that.
+ */
+static int ocfs2_rotate_tree_left(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret, orig_credits = handle->h_buffer_credits;
+ struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+
+ el = path_leaf_el(path);
+ if (!ocfs2_is_empty_extent(&el->l_recs[0]))
+ return 0;
+
+ if (path->p_tree_depth == 0) {
+rightmost_no_delete:
+ /*
+ * Inline extents. This is trivially handled, so do
+ * it up front.
+ */
+ ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Handle rightmost branch now. There's several cases:
+ * 1) simple rotation leaving records in there. That's trivial.
+ * 2) rotation requiring a branch delete - there's no more
+ * records left. Two cases of this:
+ * a) There are branches to the left.
+ * b) This is also the leftmost (the only) branch.
+ *
+ * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
+ * 2a) we need the left branch so that we can update it with the unlink
+ * 2b) we need to bring the root back to inline extents.
+ */
+
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+ el = &eb->h_list;
+ if (eb->h_next_leaf_blk == 0) {
+ /*
+ * This gets a bit tricky if we're going to delete the
+ * rightmost path. Get the other cases out of the way
+ * 1st.
+ */
+ if (le16_to_cpu(el->l_next_free_rec) > 1)
+ goto rightmost_no_delete;
+
+ if (le16_to_cpu(el->l_next_free_rec) == 0) {
+ ret = -EIO;
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has empty extent block at %llu",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ (unsigned long long)le64_to_cpu(eb->h_blkno));
+ goto out;
+ }
+
+ /*
+ * XXX: The caller can not trust "path" any more after
+ * this as it will have been deleted. What do we do?
+ *
+ * In theory the rotate-for-merge code will never get
+ * here because it'll always ask for a rotate in a
+ * nonempty list.
+ */
+
+ ret = ocfs2_remove_rightmost_path(handle, et, path,
+ dealloc);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Now we can loop, remembering the path we get from -EAGAIN
+ * and restarting from there.
+ */
+try_rotate:
+ ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
+ dealloc, &restart_path);
+ if (ret && ret != -EAGAIN) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ while (ret == -EAGAIN) {
+ tmp_path = restart_path;
+ restart_path = NULL;
+
+ ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
+ tmp_path, dealloc,
+ &restart_path);
+ if (ret && ret != -EAGAIN) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_free_path(tmp_path);
+ tmp_path = NULL;
+
+ if (ret == 0)
+ goto try_rotate;
+ }
+
+out:
+ ocfs2_free_path(tmp_path);
+ ocfs2_free_path(restart_path);
+ return ret;
+}
+
+static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
+ int index)
+{
+ struct ocfs2_extent_rec *rec = &el->l_recs[index];
+ unsigned int size;
+
+ if (rec->e_leaf_clusters == 0) {
+ /*
+ * We consumed all of the merged-from record. An empty
+ * extent cannot exist anywhere but the 1st array
+ * position, so move things over if the merged-from
+ * record doesn't occupy that position.
+ *
+ * This creates a new empty extent so the caller
+ * should be smart enough to have removed any existing
+ * ones.
+ */
+ if (index > 0) {
+ BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
+ size = index * sizeof(struct ocfs2_extent_rec);
+ memmove(&el->l_recs[1], &el->l_recs[0], size);
+ }
+
+ /*
+ * Always memset - the caller doesn't check whether it
+ * created an empty extent, so there could be junk in
+ * the other fields.
+ */
+ memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+ }
+}
+
+static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path **ret_right_path)
+{
+ int ret;
+ u32 right_cpos;
+ struct ocfs2_path *right_path = NULL;
+ struct ocfs2_extent_list *left_el;
+
+ *ret_right_path = NULL;
+
+ /* This function shouldn't be called for non-trees. */
+ BUG_ON(left_path->p_tree_depth == 0);
+
+ left_el = path_leaf_el(left_path);
+ BUG_ON(left_el->l_next_free_rec != left_el->l_count);
+
+ ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+ left_path, &right_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* This function shouldn't be called for the rightmost leaf. */
+ BUG_ON(right_cpos == 0);
+
+ right_path = ocfs2_new_path_from_path(left_path);
+ if (!right_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ *ret_right_path = right_path;
+out:
+ if (ret)
+ ocfs2_free_path(right_path);
+ return ret;
+}
+
+/*
+ * Remove split_rec clusters from the record at index and merge them
+ * onto the beginning of the record "next" to it.
+ * For index < l_count - 1, the next means the extent rec at index + 1.
+ * For index == l_count - 1, the "next" means the 1st extent rec of the
+ * next extent block.
+ */
+static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
+ handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *split_rec,
+ int index)
+{
+ int ret, next_free, i;
+ unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
+ struct ocfs2_extent_rec *left_rec;
+ struct ocfs2_extent_rec *right_rec;
+ struct ocfs2_extent_list *right_el;
+ struct ocfs2_path *right_path = NULL;
+ int subtree_index = 0;
+ struct ocfs2_extent_list *el = path_leaf_el(left_path);
+ struct buffer_head *bh = path_leaf_bh(left_path);
+ struct buffer_head *root_bh = NULL;
+
+ BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
+ left_rec = &el->l_recs[index];
+
+ if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
+ le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
+ /* we meet with a cross extent block merge. */
+ ret = ocfs2_get_right_path(et, left_path, &right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ right_el = path_leaf_el(right_path);
+ next_free = le16_to_cpu(right_el->l_next_free_rec);
+ BUG_ON(next_free <= 0);
+ right_rec = &right_el->l_recs[0];
+ if (ocfs2_is_empty_extent(right_rec)) {
+ BUG_ON(next_free <= 1);
+ right_rec = &right_el->l_recs[1];
+ }
+
+ BUG_ON(le32_to_cpu(left_rec->e_cpos) +
+ le16_to_cpu(left_rec->e_leaf_clusters) !=
+ le32_to_cpu(right_rec->e_cpos));
+
+ subtree_index = ocfs2_find_subtree_root(et, left_path,
+ right_path);
+
+ ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
+ handle->h_buffer_credits,
+ right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ root_bh = left_path->p_node[subtree_index].bh;
+ BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
+ subtree_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = subtree_index + 1;
+ i < path_num_items(right_path); i++) {
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+ right_path, i);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+ left_path, i);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ } else {
+ BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
+ right_rec = &el->l_recs[index + 1];
+ }
+
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
+ path_num_items(left_path) - 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
+
+ le32_add_cpu(&right_rec->e_cpos, -split_clusters);
+ le64_add_cpu(&right_rec->e_blkno,
+ -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
+ split_clusters));
+ le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
+
+ ocfs2_cleanup_merge(el, index);
+
+ ocfs2_journal_dirty(handle, bh);
+ if (right_path) {
+ ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
+ ocfs2_complete_edge_insert(handle, left_path, right_path,
+ subtree_index);
+ }
+out:
+ if (right_path)
+ ocfs2_free_path(right_path);
+ return ret;
+}
+
+static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *right_path,
+ struct ocfs2_path **ret_left_path)
+{
+ int ret;
+ u32 left_cpos;
+ struct ocfs2_path *left_path = NULL;
+
+ *ret_left_path = NULL;
+
+ /* This function shouldn't be called for non-trees. */
+ BUG_ON(right_path->p_tree_depth == 0);
+
+ ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+ right_path, &left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* This function shouldn't be called for the leftmost leaf. */
+ BUG_ON(left_cpos == 0);
+
+ left_path = ocfs2_new_path_from_path(right_path);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ *ret_left_path = left_path;
+out:
+ if (ret)
+ ocfs2_free_path(left_path);
+ return ret;
+}
+
+/*
+ * Remove split_rec clusters from the record at index and merge them
+ * onto the tail of the record "before" it.
+ * For index > 0, the "before" means the extent rec at index - 1.
+ *
+ * For index == 0, the "before" means the last record of the previous
+ * extent block. And there is also a situation that we may need to
+ * remove the rightmost leaf extent block in the right_path and change
+ * the right path to indicate the new rightmost path.
+ */
+static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
+ handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int index)
+{
+ int ret, i, subtree_index = 0, has_empty_extent = 0;
+ unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
+ struct ocfs2_extent_rec *left_rec;
+ struct ocfs2_extent_rec *right_rec;
+ struct ocfs2_extent_list *el = path_leaf_el(right_path);
+ struct buffer_head *bh = path_leaf_bh(right_path);
+ struct buffer_head *root_bh = NULL;
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_extent_list *left_el;
+
+ BUG_ON(index < 0);
+
+ right_rec = &el->l_recs[index];
+ if (index == 0) {
+ /* we meet with a cross extent block merge. */
+ ret = ocfs2_get_left_path(et, right_path, &left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ left_el = path_leaf_el(left_path);
+ BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
+ le16_to_cpu(left_el->l_count));
+
+ left_rec = &left_el->l_recs[
+ le16_to_cpu(left_el->l_next_free_rec) - 1];
+ BUG_ON(le32_to_cpu(left_rec->e_cpos) +
+ le16_to_cpu(left_rec->e_leaf_clusters) !=
+ le32_to_cpu(split_rec->e_cpos));
+
+ subtree_index = ocfs2_find_subtree_root(et, left_path,
+ right_path);
+
+ ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
+ handle->h_buffer_credits,
+ left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ root_bh = left_path->p_node[subtree_index].bh;
+ BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
+ subtree_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = subtree_index + 1;
+ i < path_num_items(right_path); i++) {
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+ right_path, i);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
+ left_path, i);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ } else {
+ left_rec = &el->l_recs[index - 1];
+ if (ocfs2_is_empty_extent(&el->l_recs[0]))
+ has_empty_extent = 1;
+ }
+
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
+ path_num_items(right_path) - 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (has_empty_extent && index == 1) {
+ /*
+ * The easy case - we can just plop the record right in.
+ */
+ *left_rec = *split_rec;
+
+ has_empty_extent = 0;
+ } else
+ le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
+
+ le32_add_cpu(&right_rec->e_cpos, split_clusters);
+ le64_add_cpu(&right_rec->e_blkno,
+ ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
+ split_clusters));
+ le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
+
+ ocfs2_cleanup_merge(el, index);
+
+ ocfs2_journal_dirty(handle, bh);
+ if (left_path) {
+ ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+
+ /*
+ * In the situation that the right_rec is empty and the extent
+ * block is empty also, ocfs2_complete_edge_insert can't handle
+ * it and we need to delete the right extent block.
+ */
+ if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
+ le16_to_cpu(el->l_next_free_rec) == 1) {
+
+ ret = ocfs2_remove_rightmost_path(handle, et,
+ right_path,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Now the rightmost extent block has been deleted.
+ * So we use the new rightmost path.
+ */
+ ocfs2_mv_path(right_path, left_path);
+ left_path = NULL;
+ } else
+ ocfs2_complete_edge_insert(handle, left_path,
+ right_path, subtree_index);
+ }
+out:
+ if (left_path)
+ ocfs2_free_path(left_path);
+ return ret;
+}
+
+static int ocfs2_try_to_merge_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ int split_index,
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_merge_ctxt *ctxt)
+{
+ int ret = 0;
+ struct ocfs2_extent_list *el = path_leaf_el(path);
+ struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
+
+ BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
+
+ if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+ /*
+ * The merge code will need to create an empty
+ * extent to take the place of the newly
+ * emptied slot. Remove any pre-existing empty
+ * extents - having more than one in a leaf is
+ * illegal.
+ */
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ split_index--;
+ rec = &el->l_recs[split_index];
+ }
+
+ if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
+ /*
+ * Left-right contig implies this.
+ */
+ BUG_ON(!ctxt->c_split_covers_rec);
+
+ /*
+ * Since the leftright insert always covers the entire
+ * extent, this call will delete the insert record
+ * entirely, resulting in an empty extent record added to
+ * the extent block.
+ *
+ * Since the adding of an empty extent shifts
+ * everything back to the right, there's no need to
+ * update split_index here.
+ *
+ * When the split_index is zero, we need to merge it to the
+ * prevoius extent block. It is more efficient and easier
+ * if we do merge_right first and merge_left later.
+ */
+ ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
+ split_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We can only get this from logic error above.
+ */
+ BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+
+ /* The merge left us with an empty extent, remove it. */
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ rec = &el->l_recs[split_index];
+
+ /*
+ * Note that we don't pass split_rec here on purpose -
+ * we've merged it into the rec already.
+ */
+ ret = ocfs2_merge_rec_left(path, handle, et, rec,
+ dealloc, split_index);
+
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+ /*
+ * Error from this last rotate is not critical, so
+ * print but don't bubble it up.
+ */
+ if (ret)
+ mlog_errno(ret);
+ ret = 0;
+ } else {
+ /*
+ * Merge a record to the left or right.
+ *
+ * 'contig_type' is relative to the existing record,
+ * so for example, if we're "right contig", it's to
+ * the record on the left (hence the left merge).
+ */
+ if (ctxt->c_contig_type == CONTIG_RIGHT) {
+ ret = ocfs2_merge_rec_left(path, handle, et,
+ split_rec, dealloc,
+ split_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else {
+ ret = ocfs2_merge_rec_right(path, handle,
+ et, split_rec,
+ split_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (ctxt->c_split_covers_rec) {
+ /*
+ * The merge may have left an empty extent in
+ * our leaf. Try to rotate it away.
+ */
+ ret = ocfs2_rotate_tree_left(handle, et, path,
+ dealloc);
+ if (ret)
+ mlog_errno(ret);
+ ret = 0;
+ }
+ }
+
+out:
+ return ret;
+}
+
+static void ocfs2_subtract_from_rec(struct super_block *sb,
+ enum ocfs2_split_type split,
+ struct ocfs2_extent_rec *rec,
+ struct ocfs2_extent_rec *split_rec)
+{
+ u64 len_blocks;
+
+ len_blocks = ocfs2_clusters_to_blocks(sb,
+ le16_to_cpu(split_rec->e_leaf_clusters));
+
+ if (split == SPLIT_LEFT) {
+ /*
+ * Region is on the left edge of the existing
+ * record.
+ */
+ le32_add_cpu(&rec->e_cpos,
+ le16_to_cpu(split_rec->e_leaf_clusters));
+ le64_add_cpu(&rec->e_blkno, len_blocks);
+ le16_add_cpu(&rec->e_leaf_clusters,
+ -le16_to_cpu(split_rec->e_leaf_clusters));
+ } else {
+ /*
+ * Region is on the right edge of the existing
+ * record.
+ */
+ le16_add_cpu(&rec->e_leaf_clusters,
+ -le16_to_cpu(split_rec->e_leaf_clusters));
+ }
+}
+
+/*
+ * Do the final bits of extent record insertion at the target leaf
+ * list. If this leaf is part of an allocation tree, it is assumed
+ * that the tree above has been prepared.
+ */
+static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_extent_list *el,
+ struct ocfs2_insert_type *insert)
+{
+ int i = insert->ins_contig_index;
+ unsigned int range;
+ struct ocfs2_extent_rec *rec;
+
+ BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+ if (insert->ins_split != SPLIT_NONE) {
+ i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
+ BUG_ON(i == -1);
+ rec = &el->l_recs[i];
+ ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ insert->ins_split, rec,
+ insert_rec);
+ goto rotate;
+ }
+
+ /*
+ * Contiguous insert - either left or right.
+ */
+ if (insert->ins_contig != CONTIG_NONE) {
+ rec = &el->l_recs[i];
+ if (insert->ins_contig == CONTIG_LEFT) {
+ rec->e_blkno = insert_rec->e_blkno;
+ rec->e_cpos = insert_rec->e_cpos;
+ }
+ le16_add_cpu(&rec->e_leaf_clusters,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
+ return;
+ }
+
+ /*
+ * Handle insert into an empty leaf.
+ */
+ if (le16_to_cpu(el->l_next_free_rec) == 0 ||
+ ((le16_to_cpu(el->l_next_free_rec) == 1) &&
+ ocfs2_is_empty_extent(&el->l_recs[0]))) {
+ el->l_recs[0] = *insert_rec;
+ el->l_next_free_rec = cpu_to_le16(1);
+ return;
+ }
+
+ /*
+ * Appending insert.
+ */
+ if (insert->ins_appending == APPEND_TAIL) {
+ i = le16_to_cpu(el->l_next_free_rec) - 1;
+ rec = &el->l_recs[i];
+ range = le32_to_cpu(rec->e_cpos)
+ + le16_to_cpu(rec->e_leaf_clusters);
+ BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
+
+ mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
+ le16_to_cpu(el->l_count),
+ "owner %llu, depth %u, count %u, next free %u, "
+ "rec.cpos %u, rec.clusters %u, "
+ "insert.cpos %u, insert.clusters %u\n",
+ ocfs2_metadata_cache_owner(et->et_ci),
+ le16_to_cpu(el->l_tree_depth),
+ le16_to_cpu(el->l_count),
+ le16_to_cpu(el->l_next_free_rec),
+ le32_to_cpu(el->l_recs[i].e_cpos),
+ le16_to_cpu(el->l_recs[i].e_leaf_clusters),
+ le32_to_cpu(insert_rec->e_cpos),
+ le16_to_cpu(insert_rec->e_leaf_clusters));
+ i++;
+ el->l_recs[i] = *insert_rec;
+ le16_add_cpu(&el->l_next_free_rec, 1);
+ return;
+ }
+
+rotate:
+ /*
+ * Ok, we have to rotate.
+ *
+ * At this point, it is safe to assume that inserting into an
+ * empty leaf and appending to a leaf have both been handled
+ * above.
+ *
+ * This leaf needs to have space, either by the empty 1st
+ * extent record, or by virtue of an l_next_rec < l_count.
+ */
+ ocfs2_rotate_leaf(el, insert_rec);
+}
+
+static void ocfs2_adjust_rightmost_records(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ int ret, i, next_free;
+ struct buffer_head *bh;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+
+ /*
+ * Update everything except the leaf block.
+ */
+ for (i = 0; i < path->p_tree_depth; i++) {
+ bh = path->p_node[i].bh;
+ el = path->p_node[i].el;
+
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ if (next_free == 0) {
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has a bad extent list",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
+ ret = -EIO;
+ return;
+ }
+
+ rec = &el->l_recs[next_free - 1];
+
+ rec->e_int_clusters = insert_rec->e_cpos;
+ le32_add_cpu(&rec->e_int_clusters,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
+ le32_add_cpu(&rec->e_int_clusters,
+ -le32_to_cpu(rec->e_cpos));
+
+ ocfs2_journal_dirty(handle, bh);
+ }
+}
+
+static int ocfs2_append_rec_to_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_path *right_path,
+ struct ocfs2_path **ret_left_path)
+{
+ int ret, next_free;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_path *left_path = NULL;
+
+ *ret_left_path = NULL;
+
+ /*
+ * This shouldn't happen for non-trees. The extent rec cluster
+ * count manipulation below only works for interior nodes.
+ */
+ BUG_ON(right_path->p_tree_depth == 0);
+
+ /*
+ * If our appending insert is at the leftmost edge of a leaf,
+ * then we might need to update the rightmost records of the
+ * neighboring path.
+ */
+ el = path_leaf_el(right_path);
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ if (next_free == 0 ||
+ (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
+ u32 left_cpos;
+
+ ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+ right_path, &left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_append_rec_to_path(
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci),
+ le32_to_cpu(insert_rec->e_cpos),
+ left_cpos);
+
+ /*
+ * No need to worry if the append is already in the
+ * leftmost leaf.
+ */
+ if (left_cpos) {
+ left_path = ocfs2_new_path_from_path(right_path);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(et->et_ci, left_path,
+ left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * ocfs2_insert_path() will pass the left_path to the
+ * journal for us.
+ */
+ }
+ }
+
+ ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
+
+ *ret_left_path = left_path;
+ ret = 0;
+out:
+ if (ret != 0)
+ ocfs2_free_path(left_path);
+
+ return ret;
+}
+
+static void ocfs2_split_record(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ struct ocfs2_extent_rec *split_rec,
+ enum ocfs2_split_type split)
+{
+ int index;
+ u32 cpos = le32_to_cpu(split_rec->e_cpos);
+ struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
+ struct ocfs2_extent_rec *rec, *tmprec;
+
+ right_el = path_leaf_el(right_path);
+ if (left_path)
+ left_el = path_leaf_el(left_path);
+
+ el = right_el;
+ insert_el = right_el;
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index != -1) {
+ if (index == 0 && left_path) {
+ BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
+
+ /*
+ * This typically means that the record
+ * started in the left path but moved to the
+ * right as a result of rotation. We either
+ * move the existing record to the left, or we
+ * do the later insert there.
+ *
+ * In this case, the left path should always
+ * exist as the rotate code will have passed
+ * it back for a post-insert update.
+ */
+
+ if (split == SPLIT_LEFT) {
+ /*
+ * It's a left split. Since we know
+ * that the rotate code gave us an
+ * empty extent in the left path, we
+ * can just do the insert there.
+ */
+ insert_el = left_el;
+ } else {
+ /*
+ * Right split - we have to move the
+ * existing record over to the left
+ * leaf. The insert will be into the
+ * newly created empty extent in the
+ * right leaf.
+ */
+ tmprec = &right_el->l_recs[index];
+ ocfs2_rotate_leaf(left_el, tmprec);
+ el = left_el;
+
+ memset(tmprec, 0, sizeof(*tmprec));
+ index = ocfs2_search_extent_list(left_el, cpos);
+ BUG_ON(index == -1);
+ }
+ }
+ } else {
+ BUG_ON(!left_path);
+ BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
+ /*
+ * Left path is easy - we can just allow the insert to
+ * happen.
+ */
+ el = left_el;
+ insert_el = left_el;
+ index = ocfs2_search_extent_list(el, cpos);
+ BUG_ON(index == -1);
+ }
+
+ rec = &el->l_recs[index];
+ ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ split, rec, split_rec);
+ ocfs2_rotate_leaf(insert_el, split_rec);
+}
+
+/*
+ * This function only does inserts on an allocation b-tree. For tree
+ * depth = 0, ocfs2_insert_at_leaf() is called directly.
+ *
+ * right_path is the path we want to do the actual insert
+ * in. left_path should only be passed in if we need to update that
+ * portion of the tree after an edge insert.
+ */
+static int ocfs2_insert_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_insert_type *insert)
+{
+ int ret, subtree_index;
+ struct buffer_head *leaf_bh = path_leaf_bh(right_path);
+
+ if (left_path) {
+ /*
+ * There's a chance that left_path got passed back to
+ * us without being accounted for in the
+ * journal. Extend our transaction here to be sure we
+ * can change those blocks.
+ */
+ ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /*
+ * Pass both paths to the journal. The majority of inserts
+ * will be touching all components anyway.
+ */
+ ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (insert->ins_split != SPLIT_NONE) {
+ /*
+ * We could call ocfs2_insert_at_leaf() for some types
+ * of splits, but it's easier to just let one separate
+ * function sort it all out.
+ */
+ ocfs2_split_record(et, left_path, right_path,
+ insert_rec, insert->ins_split);
+
+ /*
+ * Split might have modified either leaf and we don't
+ * have a guarantee that the later edge insert will
+ * dirty this for us.
+ */
+ if (left_path)
+ ocfs2_journal_dirty(handle,
+ path_leaf_bh(left_path));
+ } else
+ ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
+ insert);
+
+ ocfs2_journal_dirty(handle, leaf_bh);
+
+ if (left_path) {
+ /*
+ * The rotate code has indicated that we need to fix
+ * up portions of the tree after the insert.
+ *
+ * XXX: Should we extend the transaction here?
+ */
+ subtree_index = ocfs2_find_subtree_root(et, left_path,
+ right_path);
+ ocfs2_complete_edge_insert(handle, left_path, right_path,
+ subtree_index);
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static int ocfs2_do_insert_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_insert_type *type)
+{
+ int ret, rotate = 0;
+ u32 cpos;
+ struct ocfs2_path *right_path = NULL;
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_extent_list *el;
+
+ el = et->et_root_el;
+
+ ret = ocfs2_et_root_journal_access(handle, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (le16_to_cpu(el->l_tree_depth) == 0) {
+ ocfs2_insert_at_leaf(et, insert_rec, el, type);
+ goto out_update_clusters;
+ }
+
+ right_path = ocfs2_new_path_from_et(et);
+ if (!right_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Determine the path to start with. Rotations need the
+ * rightmost path, everything else can go directly to the
+ * target leaf.
+ */
+ cpos = le32_to_cpu(insert_rec->e_cpos);
+ if (type->ins_appending == APPEND_NONE &&
+ type->ins_contig == CONTIG_NONE) {
+ rotate = 1;
+ cpos = UINT_MAX;
+ }
+
+ ret = ocfs2_find_path(et->et_ci, right_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Rotations and appends need special treatment - they modify
+ * parts of the tree's above them.
+ *
+ * Both might pass back a path immediate to the left of the
+ * one being inserted to. This will be cause
+ * ocfs2_insert_path() to modify the rightmost records of
+ * left_path to account for an edge insert.
+ *
+ * XXX: When modifying this code, keep in mind that an insert
+ * can wind up skipping both of these two special cases...
+ */
+ if (rotate) {
+ ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
+ le32_to_cpu(insert_rec->e_cpos),
+ right_path, &left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * ocfs2_rotate_tree_right() might have extended the
+ * transaction without re-journaling our tree root.
+ */
+ ret = ocfs2_et_root_journal_access(handle, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else if (type->ins_appending == APPEND_TAIL
+ && type->ins_contig != CONTIG_LEFT) {
+ ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
+ right_path, &left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_insert_path(handle, et, left_path, right_path,
+ insert_rec, type);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out_update_clusters:
+ if (type->ins_split == SPLIT_NONE)
+ ocfs2_et_update_clusters(et,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
+
+ ocfs2_journal_dirty(handle, et->et_root_bh);
+
+out:
+ ocfs2_free_path(left_path);
+ ocfs2_free_path(right_path);
+
+ return ret;
+}
+
+static enum ocfs2_contig_type
+ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_extent_list *el, int index,
+ struct ocfs2_extent_rec *split_rec)
+{
+ int status;
+ enum ocfs2_contig_type ret = CONTIG_NONE;
+ u32 left_cpos, right_cpos;
+ struct ocfs2_extent_rec *rec = NULL;
+ struct ocfs2_extent_list *new_el;
+ struct ocfs2_path *left_path = NULL, *right_path = NULL;
+ struct buffer_head *bh;
+ struct ocfs2_extent_block *eb;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+
+ if (index > 0) {
+ rec = &el->l_recs[index - 1];
+ } else if (path->p_tree_depth > 0) {
+ status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
+ if (status)
+ goto out;
+
+ if (left_cpos != 0) {
+ left_path = ocfs2_new_path_from_path(path);
+ if (!left_path)
+ goto out;
+
+ status = ocfs2_find_path(et->et_ci, left_path,
+ left_cpos);
+ if (status)
+ goto out;
+
+ new_el = path_leaf_el(left_path);
+
+ if (le16_to_cpu(new_el->l_next_free_rec) !=
+ le16_to_cpu(new_el->l_count)) {
+ bh = path_leaf_bh(left_path);
+ eb = (struct ocfs2_extent_block *)bh->b_data;
+ ocfs2_error(sb,
+ "Extent block #%llu has an "
+ "invalid l_next_free_rec of "
+ "%d. It should have "
+ "matched the l_count of %d",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec),
+ le16_to_cpu(new_el->l_count));
+ status = -EINVAL;
+ goto out;
+ }
+ rec = &new_el->l_recs[
+ le16_to_cpu(new_el->l_next_free_rec) - 1];
+ }
+ }
+
+ /*
+ * We're careful to check for an empty extent record here -
+ * the merge code will know what to do if it sees one.
+ */
+ if (rec) {
+ if (index == 1 && ocfs2_is_empty_extent(rec)) {
+ if (split_rec->e_cpos == el->l_recs[index].e_cpos)
+ ret = CONTIG_RIGHT;
+ } else {
+ ret = ocfs2_et_extent_contig(et, rec, split_rec);
+ }
+ }
+
+ rec = NULL;
+ if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
+ rec = &el->l_recs[index + 1];
+ else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
+ path->p_tree_depth > 0) {
+ status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
+ if (status)
+ goto out;
+
+ if (right_cpos == 0)
+ goto out;
+
+ right_path = ocfs2_new_path_from_path(path);
+ if (!right_path)
+ goto out;
+
+ status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
+ if (status)
+ goto out;
+
+ new_el = path_leaf_el(right_path);
+ rec = &new_el->l_recs[0];
+ if (ocfs2_is_empty_extent(rec)) {
+ if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
+ bh = path_leaf_bh(right_path);
+ eb = (struct ocfs2_extent_block *)bh->b_data;
+ ocfs2_error(sb,
+ "Extent block #%llu has an "
+ "invalid l_next_free_rec of %d",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec));
+ status = -EINVAL;
+ goto out;
+ }
+ rec = &new_el->l_recs[1];
+ }
+ }
+
+ if (rec) {
+ enum ocfs2_contig_type contig_type;
+
+ contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
+
+ if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
+ ret = CONTIG_LEFTRIGHT;
+ else if (ret == CONTIG_NONE)
+ ret = contig_type;
+ }
+
+out:
+ if (left_path)
+ ocfs2_free_path(left_path);
+ if (right_path)
+ ocfs2_free_path(right_path);
+
+ return ret;
+}
+
+static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
+ struct ocfs2_insert_type *insert,
+ struct ocfs2_extent_list *el,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ int i;
+ enum ocfs2_contig_type contig_type = CONTIG_NONE;
+
+ BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+ for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
+ insert_rec);
+ if (contig_type != CONTIG_NONE) {
+ insert->ins_contig_index = i;
+ break;
+ }
+ }
+ insert->ins_contig = contig_type;
+
+ if (insert->ins_contig != CONTIG_NONE) {
+ struct ocfs2_extent_rec *rec =
+ &el->l_recs[insert->ins_contig_index];
+ unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
+ le16_to_cpu(insert_rec->e_leaf_clusters);
+
+ /*
+ * Caller might want us to limit the size of extents, don't
+ * calculate contiguousness if we might exceed that limit.
+ */
+ if (et->et_max_leaf_clusters &&
+ (len > et->et_max_leaf_clusters))
+ insert->ins_contig = CONTIG_NONE;
+ }
+}
+
+/*
+ * This should only be called against the righmost leaf extent list.
+ *
+ * ocfs2_figure_appending_type() will figure out whether we'll have to
+ * insert at the tail of the rightmost leaf.
+ *
+ * This should also work against the root extent list for tree's with 0
+ * depth. If we consider the root extent list to be the rightmost leaf node
+ * then the logic here makes sense.
+ */
+static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
+ struct ocfs2_extent_list *el,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ int i;
+ u32 cpos = le32_to_cpu(insert_rec->e_cpos);
+ struct ocfs2_extent_rec *rec;
+
+ insert->ins_appending = APPEND_NONE;
+
+ BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+ if (!el->l_next_free_rec)
+ goto set_tail_append;
+
+ if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+ /* Were all records empty? */
+ if (le16_to_cpu(el->l_next_free_rec) == 1)
+ goto set_tail_append;
}
- /* Can we allocate without adding/shifting tree bits? */
i = le16_to_cpu(el->l_next_free_rec) - 1;
- if (le16_to_cpu(el->l_next_free_rec) == 0
- || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
- || le32_to_cpu(el->l_recs[i].e_clusters) == 0
- || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
- goto out_add;
+ rec = &el->l_recs[i];
- mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
- "tree now.\n");
+ if (cpos >=
+ (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
+ goto set_tail_append;
- shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
- if (shift < 0) {
- status = shift;
+ return;
+
+set_tail_append:
+ insert->ins_appending = APPEND_TAIL;
+}
+
+/*
+ * Helper function called at the beginning of an insert.
+ *
+ * This computes a few things that are commonly used in the process of
+ * inserting into the btree:
+ * - Whether the new extent is contiguous with an existing one.
+ * - The current tree depth.
+ * - Whether the insert is an appending one.
+ * - The total # of free records in the tree.
+ *
+ * All of the information is stored on the ocfs2_insert_type
+ * structure.
+ */
+static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
+ struct buffer_head **last_eb_bh,
+ struct ocfs2_extent_rec *insert_rec,
+ int *free_records,
+ struct ocfs2_insert_type *insert)
+{
+ int ret;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_path *path = NULL;
+ struct buffer_head *bh = NULL;
+
+ insert->ins_split = SPLIT_NONE;
+
+ el = et->et_root_el;
+ insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
+
+ if (el->l_tree_depth) {
+ /*
+ * If we have tree depth, we read in the
+ * rightmost extent block ahead of time as
+ * ocfs2_figure_insert_type() and ocfs2_add_branch()
+ * may want it later.
+ */
+ ret = ocfs2_read_extent_block(et->et_ci,
+ ocfs2_et_get_last_eb_blk(et),
+ &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ eb = (struct ocfs2_extent_block *) bh->b_data;
+ el = &eb->h_list;
+ }
+
+ /*
+ * Unless we have a contiguous insert, we'll need to know if
+ * there is room left in our allocation tree for another
+ * extent record.
+ *
+ * XXX: This test is simplistic, we can search for empty
+ * extent records too.
+ */
+ *free_records = le16_to_cpu(el->l_count) -
+ le16_to_cpu(el->l_next_free_rec);
+
+ if (!insert->ins_tree_depth) {
+ ocfs2_figure_contig_type(et, insert, el, insert_rec);
+ ocfs2_figure_appending_type(insert, el, insert_rec);
+ return 0;
+ }
+
+ path = ocfs2_new_path_from_et(et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * In the case that we're inserting past what the tree
+ * currently accounts for, ocfs2_find_path() will return for
+ * us the rightmost tree path. This is accounted for below in
+ * the appending code.
+ */
+ ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+
+ /*
+ * Now that we have the path, there's two things we want to determine:
+ * 1) Contiguousness (also set contig_index if this is so)
+ *
+ * 2) Are we doing an append? We can trivially break this up
+ * into two types of appends: simple record append, or a
+ * rotate inside the tail leaf.
+ */
+ ocfs2_figure_contig_type(et, insert, el, insert_rec);
+
+ /*
+ * The insert code isn't quite ready to deal with all cases of
+ * left contiguousness. Specifically, if it's an insert into
+ * the 1st record in a leaf, it will require the adjustment of
+ * cluster count on the last record of the path directly to it's
+ * left. For now, just catch that case and fool the layers
+ * above us. This works just fine for tree_depth == 0, which
+ * is why we allow that above.
+ */
+ if (insert->ins_contig == CONTIG_LEFT &&
+ insert->ins_contig_index == 0)
+ insert->ins_contig = CONTIG_NONE;
+
+ /*
+ * Ok, so we can simply compare against last_eb to figure out
+ * whether the path doesn't exist. This will only happen in
+ * the case that we're doing a tail append, so maybe we can
+ * take advantage of that information somehow.
+ */
+ if (ocfs2_et_get_last_eb_blk(et) ==
+ path_leaf_bh(path)->b_blocknr) {
+ /*
+ * Ok, ocfs2_find_path() returned us the rightmost
+ * tree path. This might be an appending insert. There are
+ * two cases:
+ * 1) We're doing a true append at the tail:
+ * -This might even be off the end of the leaf
+ * 2) We're "appending" by rotating in the tail
+ */
+ ocfs2_figure_appending_type(insert, el, insert_rec);
+ }
+
+out:
+ ocfs2_free_path(path);
+
+ if (ret == 0)
+ *last_eb_bh = bh;
+ else
+ brelse(bh);
+ return ret;
+}
+
+/*
+ * Insert an extent into a btree.
+ *
+ * The caller needs to update the owning btree's cluster count.
+ */
+int ocfs2_insert_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 cpos,
+ u64 start_blk,
+ u32 new_clusters,
+ u8 flags,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int status;
+ int uninitialized_var(free_records);
+ struct buffer_head *last_eb_bh = NULL;
+ struct ocfs2_insert_type insert = {0, };
+ struct ocfs2_extent_rec rec;
+
+ trace_ocfs2_insert_extent_start(
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos, new_clusters);
+
+ memset(&rec, 0, sizeof(rec));
+ rec.e_cpos = cpu_to_le32(cpos);
+ rec.e_blkno = cpu_to_le64(start_blk);
+ rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+ rec.e_flags = flags;
+ status = ocfs2_et_insert_check(et, &rec);
+ if (status) {
mlog_errno(status);
goto bail;
}
- /* We traveled all the way to the bottom of the allocation tree
- * and didn't find room for any more extents - we need to add
- * another tree level */
- if (shift) {
- /* if we hit a leaf, we'd better be empty :) */
- BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
- le16_to_cpu(el->l_count));
- BUG_ON(bh);
- mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
- "(current = %u)\n",
- le16_to_cpu(fe->id2.i_list.l_tree_depth));
+ status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
+ &free_records, &insert);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
- /* ocfs2_shift_tree_depth will return us a buffer with
- * the new extent block (so we can pass that to
- * ocfs2_add_branch). */
- status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
- meta_ac, &bh);
- if (status < 0) {
+ trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
+ insert.ins_contig_index, free_records,
+ insert.ins_tree_depth);
+
+ if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
+ status = ocfs2_grow_tree(handle, et,
+ &insert.ins_tree_depth, &last_eb_bh,
+ meta_ac);
+ if (status) {
mlog_errno(status);
goto bail;
}
- /* Special case: we have room now if we shifted from
- * tree_depth 0 */
- if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
- goto out_add;
}
- /* call ocfs2_add_branch to add the final part of the tree with
- * the new data. */
- mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
- status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
- meta_ac);
+ /* Finally, we can add clusters. This might rotate the tree for us. */
+ status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
+ if (status < 0)
+ mlog_errno(status);
+ else
+ ocfs2_et_extent_map_insert(et, &rec);
+
+bail:
+ brelse(last_eb_bh);
+
+ return status;
+}
+
+/*
+ * Allcate and add clusters into the extent b-tree.
+ * The new clusters(clusters_to_add) will be inserted at logical_offset.
+ * The extent b-tree's root is specified by et, and
+ * it is not limited to the file storage. Any extent tree can use this
+ * function if it implements the proper ocfs2_extent_tree.
+ */
+int ocfs2_add_clusters_in_btree(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 *logical_offset,
+ u32 clusters_to_add,
+ int mark_unwritten,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason_ret)
+{
+ int status = 0, err = 0;
+ int need_free = 0;
+ int free_extents;
+ enum ocfs2_alloc_restarted reason = RESTART_NONE;
+ u32 bit_off, num_bits;
+ u64 block;
+ u8 flags = 0;
+ struct ocfs2_super *osb =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
+
+ BUG_ON(!clusters_to_add);
+
+ if (mark_unwritten)
+ flags = OCFS2_EXT_UNWRITTEN;
+
+ free_extents = ocfs2_num_free_extents(osb, et);
+ if (free_extents < 0) {
+ status = free_extents;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* there are two cases which could cause us to EAGAIN in the
+ * we-need-more-metadata case:
+ * 1) we haven't reserved *any*
+ * 2) we are so fragmented, we've needed to add metadata too
+ * many times. */
+ if (!free_extents && !meta_ac) {
+ err = -1;
+ status = -EAGAIN;
+ reason = RESTART_META;
+ goto leave;
+ } else if ((!free_extents)
+ && (ocfs2_alloc_context_bits_left(meta_ac)
+ < ocfs2_extend_meta_needed(et->et_root_el))) {
+ err = -2;
+ status = -EAGAIN;
+ reason = RESTART_META;
+ goto leave;
+ }
+
+ status = __ocfs2_claim_clusters(handle, data_ac, 1,
+ clusters_to_add, &bit_off, &num_bits);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto leave;
+ }
+
+ BUG_ON(num_bits > clusters_to_add);
+
+ /* reserve our write early -- insert_extent may update the tree root */
+ status = ocfs2_et_root_journal_access(handle, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
+ need_free = 1;
goto bail;
}
-out_add:
- /* Finally, we can add clusters. */
- status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
- start_blk, new_clusters);
- if (status < 0)
+ block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ trace_ocfs2_add_clusters_in_btree(
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ bit_off, num_bits);
+ status = ocfs2_insert_extent(handle, et, *logical_offset, block,
+ num_bits, flags, meta_ac);
+ if (status < 0) {
mlog_errno(status);
+ need_free = 1;
+ goto bail;
+ }
-bail:
- if (bh)
- brelse(bh);
+ ocfs2_journal_dirty(handle, et->et_root_bh);
+
+ clusters_to_add -= num_bits;
+ *logical_offset += num_bits;
+
+ if (clusters_to_add) {
+ err = clusters_to_add;
+ status = -EAGAIN;
+ reason = RESTART_TRANS;
+ }
- if (last_eb_bh)
- brelse(last_eb_bh);
+bail:
+ if (need_free) {
+ if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ bit_off, num_bits);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, bit_off),
+ num_bits);
+ }
- mlog_exit(status);
+leave:
+ if (reason_ret)
+ *reason_ret = reason;
+ trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
return status;
}
-static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+static void ocfs2_make_right_split_rec(struct super_block *sb,
+ struct ocfs2_extent_rec *split_rec,
+ u32 cpos,
+ struct ocfs2_extent_rec *rec)
+{
+ u32 rec_cpos = le32_to_cpu(rec->e_cpos);
+ u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
+
+ memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
+
+ split_rec->e_cpos = cpu_to_le32(cpos);
+ split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
+
+ split_rec->e_blkno = rec->e_blkno;
+ le64_add_cpu(&split_rec->e_blkno,
+ ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
+
+ split_rec->e_flags = rec->e_flags;
+}
+
+static int ocfs2_split_and_insert(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct buffer_head **last_eb_bh,
+ int split_index,
+ struct ocfs2_extent_rec *orig_split_rec,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret = 0, depth;
+ unsigned int insert_range, rec_range, do_leftright = 0;
+ struct ocfs2_extent_rec tmprec;
+ struct ocfs2_extent_list *rightmost_el;
+ struct ocfs2_extent_rec rec;
+ struct ocfs2_extent_rec split_rec = *orig_split_rec;
+ struct ocfs2_insert_type insert;
+ struct ocfs2_extent_block *eb;
+
+leftright:
+ /*
+ * Store a copy of the record on the stack - it might move
+ * around as the tree is manipulated below.
+ */
+ rec = path_leaf_el(path)->l_recs[split_index];
+
+ rightmost_el = et->et_root_el;
+
+ depth = le16_to_cpu(rightmost_el->l_tree_depth);
+ if (depth) {
+ BUG_ON(!(*last_eb_bh));
+ eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
+ rightmost_el = &eb->h_list;
+ }
+
+ if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
+ le16_to_cpu(rightmost_el->l_count)) {
+ ret = ocfs2_grow_tree(handle, et,
+ &depth, last_eb_bh, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ memset(&insert, 0, sizeof(struct ocfs2_insert_type));
+ insert.ins_appending = APPEND_NONE;
+ insert.ins_contig = CONTIG_NONE;
+ insert.ins_tree_depth = depth;
+
+ insert_range = le32_to_cpu(split_rec.e_cpos) +
+ le16_to_cpu(split_rec.e_leaf_clusters);
+ rec_range = le32_to_cpu(rec.e_cpos) +
+ le16_to_cpu(rec.e_leaf_clusters);
+
+ if (split_rec.e_cpos == rec.e_cpos) {
+ insert.ins_split = SPLIT_LEFT;
+ } else if (insert_range == rec_range) {
+ insert.ins_split = SPLIT_RIGHT;
+ } else {
+ /*
+ * Left/right split. We fake this as a right split
+ * first and then make a second pass as a left split.
+ */
+ insert.ins_split = SPLIT_RIGHT;
+
+ ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ &tmprec, insert_range, &rec);
+
+ split_rec = tmprec;
+
+ BUG_ON(do_leftright);
+ do_leftright = 1;
+ }
+
+ ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (do_leftright == 1) {
+ u32 cpos;
+ struct ocfs2_extent_list *el;
+
+ do_leftright++;
+ split_rec = *orig_split_rec;
+
+ ocfs2_reinit_path(path, 1);
+
+ cpos = le32_to_cpu(split_rec.e_cpos);
+ ret = ocfs2_find_path(et->et_ci, path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+ split_index = ocfs2_search_extent_list(el, cpos);
+ goto leftright;
+ }
+out:
+
+ return ret;
+}
+
+static int ocfs2_replace_extent_rec(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_extent_list *el,
+ int split_index,
+ struct ocfs2_extent_rec *split_rec)
+{
+ int ret;
+
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
+ path_num_items(path) - 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el->l_recs[split_index] = *split_rec;
+
+ ocfs2_journal_dirty(handle, path_leaf_bh(path));
+out:
+ return ret;
+}
+
+/*
+ * Split part or all of the extent record at split_index in the leaf
+ * pointed to by path. Merge with the contiguous extent record if needed.
+ *
+ * Care is taken to handle contiguousness so as to not grow the tree.
+ *
+ * meta_ac is not strictly necessary - we only truly need it if growth
+ * of the tree is required. All other cases will degrade into a less
+ * optimal tree layout.
+ *
+ * last_eb_bh should be the rightmost leaf block for any extent
+ * btree. Since a split may grow the tree or a merge might shrink it,
+ * the caller cannot trust the contents of that buffer after this call.
+ *
+ * This code is optimized for readability - several passes might be
+ * made over certain portions of the tree. All of those blocks will
+ * have been brought into cache (and pinned via the journal), so the
+ * extra overhead is not expressed in terms of disk reads.
+ */
+int ocfs2_split_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ int split_index,
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret = 0;
+ struct ocfs2_extent_list *el = path_leaf_el(path);
+ struct buffer_head *last_eb_bh = NULL;
+ struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
+ struct ocfs2_merge_ctxt ctxt;
+ struct ocfs2_extent_list *rightmost_el;
+
+ if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
+ ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
+ (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
+ split_index,
+ split_rec);
+
+ /*
+ * The core merge / split code wants to know how much room is
+ * left in this allocation tree, so we pass the
+ * rightmost extent list.
+ */
+ if (path->p_tree_depth) {
+ struct ocfs2_extent_block *eb;
+
+ ret = ocfs2_read_extent_block(et->et_ci,
+ ocfs2_et_get_last_eb_blk(et),
+ &last_eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ rightmost_el = &eb->h_list;
+ } else
+ rightmost_el = path_root_el(path);
+
+ if (rec->e_cpos == split_rec->e_cpos &&
+ rec->e_leaf_clusters == split_rec->e_leaf_clusters)
+ ctxt.c_split_covers_rec = 1;
+ else
+ ctxt.c_split_covers_rec = 0;
+
+ ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
+
+ trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
+ ctxt.c_has_empty_extent,
+ ctxt.c_split_covers_rec);
+
+ if (ctxt.c_contig_type == CONTIG_NONE) {
+ if (ctxt.c_split_covers_rec)
+ ret = ocfs2_replace_extent_rec(handle, et, path, el,
+ split_index, split_rec);
+ else
+ ret = ocfs2_split_and_insert(handle, et, path,
+ &last_eb_bh, split_index,
+ split_rec, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+ } else {
+ ret = ocfs2_try_to_merge_extent(handle, et, path,
+ split_index, split_rec,
+ dealloc, &ctxt);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out:
+ brelse(last_eb_bh);
+ return ret;
+}
+
+/*
+ * Change the flags of the already-existing extent at cpos for len clusters.
+ *
+ * new_flags: the flags we want to set.
+ * clear_flags: the flags we want to clear.
+ * phys: the new physical offset we want this new extent starts from.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+int ocfs2_change_extent_flag(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int new_flags, int clear_flags)
+{
+ int ret, index;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+ u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
+ struct ocfs2_extent_rec split_rec;
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+
+ left_path = ocfs2_new_path_from_et(et);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(et->et_ci, left_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ el = path_leaf_el(left_path);
+
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ ocfs2_error(sb,
+ "Owner %llu has an extent at cpos %u which can no "
+ "longer be found.\n",
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci), cpos);
+ ret = -EROFS;
+ goto out;
+ }
+
+ ret = -EIO;
+ rec = &el->l_recs[index];
+ if (new_flags && (rec->e_flags & new_flags)) {
+ mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
+ "extent that already had them",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ new_flags);
+ goto out;
+ }
+
+ if (clear_flags && !(rec->e_flags & clear_flags)) {
+ mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
+ "extent that didn't have them",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ clear_flags);
+ goto out;
+ }
+
+ memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
+ split_rec.e_cpos = cpu_to_le32(cpos);
+ split_rec.e_leaf_clusters = cpu_to_le16(len);
+ split_rec.e_blkno = cpu_to_le64(start_blkno);
+ split_rec.e_flags = rec->e_flags;
+ if (new_flags)
+ split_rec.e_flags |= new_flags;
+ if (clear_flags)
+ split_rec.e_flags &= ~clear_flags;
+
+ ret = ocfs2_split_extent(handle, et, left_path,
+ index, &split_rec, meta_ac,
+ dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ ocfs2_free_path(left_path);
+ return ret;
+
+}
+
+/*
+ * Mark the already-existing extent at cpos as written for len clusters.
+ * This removes the unwritten extent flag.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+int ocfs2_mark_extent_written(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ handle_t *handle, u32 cpos, u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+
+ trace_ocfs2_mark_extent_written(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ cpos, len, phys);
+
+ if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
+ ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
+ "that are being written to, but the feature bit "
+ "is not set in the super block.",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ ret = -EROFS;
+ goto out;
+ }
+
+ /*
+ * XXX: This should be fixed up so that we just re-insert the
+ * next extent records.
+ */
+ ocfs2_et_extent_map_truncate(et, 0);
+
+ ret = ocfs2_change_extent_flag(handle, et, cpos,
+ len, phys, meta_ac, dealloc,
+ 0, OCFS2_EXT_UNWRITTEN);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ int index, u32 new_range,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret, depth, credits;
+ struct buffer_head *last_eb_bh = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *rightmost_el, *el;
+ struct ocfs2_extent_rec split_rec;
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_insert_type insert;
+
+ /*
+ * Setup the record to split before we grow the tree.
+ */
+ el = path_leaf_el(path);
+ rec = &el->l_recs[index];
+ ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ &split_rec, new_range, rec);
+
+ depth = path->p_tree_depth;
+ if (depth > 0) {
+ ret = ocfs2_read_extent_block(et->et_ci,
+ ocfs2_et_get_last_eb_blk(et),
+ &last_eb_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ rightmost_el = &eb->h_list;
+ } else
+ rightmost_el = path_leaf_el(path);
+
+ credits = path->p_tree_depth +
+ ocfs2_extend_meta_needed(et->et_root_el);
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
+ le16_to_cpu(rightmost_el->l_count)) {
+ ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
+ meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ memset(&insert, 0, sizeof(struct ocfs2_insert_type));
+ insert.ins_appending = APPEND_NONE;
+ insert.ins_contig = CONTIG_NONE;
+ insert.ins_split = SPLIT_RIGHT;
+ insert.ins_tree_depth = depth;
+
+ ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ brelse(last_eb_bh);
+ return ret;
+}
+
+static int ocfs2_truncate_rec(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path, int index,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ u32 cpos, u32 len)
+{
+ int ret;
+ u32 left_cpos, rec_range, trunc_range;
+ int wants_rotate = 0, is_rightmost_tree_rec = 0;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_extent_list *el = path_leaf_el(path);
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_extent_block *eb;
+
+ if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ index--;
+ }
+
+ if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
+ path->p_tree_depth) {
+ /*
+ * Check whether this is the rightmost tree record. If
+ * we remove all of this record or part of its right
+ * edge then an update of the record lengths above it
+ * will be required.
+ */
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+ if (eb->h_next_leaf_blk == 0)
+ is_rightmost_tree_rec = 1;
+ }
+
+ rec = &el->l_recs[index];
+ if (index == 0 && path->p_tree_depth &&
+ le32_to_cpu(rec->e_cpos) == cpos) {
+ /*
+ * Changing the leftmost offset (via partial or whole
+ * record truncate) of an interior (or rightmost) path
+ * means we have to update the subtree that is formed
+ * by this leaf and the one to it's left.
+ *
+ * There are two cases we can skip:
+ * 1) Path is the leftmost one in our btree.
+ * 2) The leaf is rightmost and will be empty after
+ * we remove the extent record - the rotate code
+ * knows how to update the newly formed edge.
+ */
+
+ ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
+ left_path = ocfs2_new_path_from_path(path);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(et->et_ci, left_path,
+ left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ }
+
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(et->et_ci, handle, path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+ trunc_range = cpos + len;
+
+ if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
+ int next_free;
+
+ memset(rec, 0, sizeof(*rec));
+ ocfs2_cleanup_merge(el, index);
+ wants_rotate = 1;
+
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ if (is_rightmost_tree_rec && next_free > 1) {
+ /*
+ * We skip the edge update if this path will
+ * be deleted by the rotate code.
+ */
+ rec = &el->l_recs[next_free - 1];
+ ocfs2_adjust_rightmost_records(handle, et, path,
+ rec);
+ }
+ } else if (le32_to_cpu(rec->e_cpos) == cpos) {
+ /* Remove leftmost portion of the record. */
+ le32_add_cpu(&rec->e_cpos, len);
+ le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
+ le16_add_cpu(&rec->e_leaf_clusters, -len);
+ } else if (rec_range == trunc_range) {
+ /* Remove rightmost portion of the record */
+ le16_add_cpu(&rec->e_leaf_clusters, -len);
+ if (is_rightmost_tree_rec)
+ ocfs2_adjust_rightmost_records(handle, et, path, rec);
+ } else {
+ /* Caller should have trapped this. */
+ mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
+ "(%u, %u)\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ le32_to_cpu(rec->e_cpos),
+ le16_to_cpu(rec->e_leaf_clusters), cpos, len);
+ BUG();
+ }
+
+ if (left_path) {
+ int subtree_index;
+
+ subtree_index = ocfs2_find_subtree_root(et, left_path, path);
+ ocfs2_complete_edge_insert(handle, left_path, path,
+ subtree_index);
+ }
+
+ ocfs2_journal_dirty(handle, path_leaf_bh(path));
+
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ ocfs2_free_path(left_path);
+ return ret;
+}
+
+int ocfs2_remove_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret, index;
+ u32 rec_range, trunc_range;
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_path *path = NULL;
+
+ /*
+ * XXX: Why are we truncating to 0 instead of wherever this
+ * affects us?
+ */
+ ocfs2_et_extent_map_truncate(et, 0);
+
+ path = ocfs2_new_path_from_et(et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(et->et_ci, path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has an extent at cpos %u which can no "
+ "longer be found.\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
+ ret = -EROFS;
+ goto out;
+ }
+
+ /*
+ * We have 3 cases of extent removal:
+ * 1) Range covers the entire extent rec
+ * 2) Range begins or ends on one edge of the extent rec
+ * 3) Range is in the middle of the extent rec (no shared edges)
+ *
+ * For case 1 we remove the extent rec and left rotate to
+ * fill the hole.
+ *
+ * For case 2 we just shrink the existing extent rec, with a
+ * tree update if the shrinking edge is also the edge of an
+ * extent block.
+ *
+ * For case 3 we do a right split to turn the extent rec into
+ * something case 2 can handle.
+ */
+ rec = &el->l_recs[index];
+ rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+ trunc_range = cpos + len;
+
+ BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
+
+ trace_ocfs2_remove_extent(
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos, len, index, le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
+
+ if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
+ ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+ cpos, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else {
+ ret = ocfs2_split_tree(handle, et, path, index,
+ trunc_range, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * The split could have manipulated the tree enough to
+ * move the record location, so we have to look for it again.
+ */
+ ocfs2_reinit_path(path, 1);
+
+ ret = ocfs2_find_path(et->et_ci, path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu: split at cpos %u lost record.",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
+ ret = -EROFS;
+ goto out;
+ }
+
+ /*
+ * Double check our values here. If anything is fishy,
+ * it's easier to catch it at the top level.
+ */
+ rec = &el->l_recs[index];
+ rec_range = le32_to_cpu(rec->e_cpos) +
+ ocfs2_rec_clusters(el, rec);
+ if (rec_range != trunc_range) {
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu: error after split at cpos %u"
+ "trunc len %u, existing record is (%u,%u)",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos, len, le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
+ ret = -EROFS;
+ goto out;
+ }
+
+ ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+ cpos, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ ocfs2_free_path(path);
+ return ret;
+}
+
+/*
+ * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
+ * same as ocfs2_lock_alloctors(), except for it accepts a blocks
+ * number to reserve some extra blocks, and it only handles meta
+ * data allocations.
+ *
+ * Currently, only ocfs2_remove_btree_range() uses it for truncating
+ * and punching holes.
+ */
+static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 extents_to_split,
+ struct ocfs2_alloc_context **ac,
+ int extra_blocks)
+{
+ int ret = 0, num_free_extents;
+ unsigned int max_recs_needed = 2 * extents_to_split;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ *ac = NULL;
+
+ num_free_extents = ocfs2_num_free_extents(osb, et);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!num_free_extents ||
+ (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
+ extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+ if (extra_blocks) {
+ ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ if (ret) {
+ if (*ac) {
+ ocfs2_free_alloc_context(*ac);
+ *ac = NULL;
+ }
+ }
+
+ return ret;
+}
+
+int ocfs2_remove_btree_range(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 phys_cpos, u32 len, int flags,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ u64 refcount_loc)
+{
+ int ret, credits = 0, extra_blocks = 0;
+ u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ handle_t *handle;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+
+ if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+ OCFS2_HAS_REFCOUNT_FL));
+
+ ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+ &ref_tree, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ ret = ocfs2_prepare_refcount_change_for_del(inode,
+ refcount_loc,
+ phys_blkno,
+ len,
+ &credits,
+ &extra_blocks);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto bail;
+ }
+ }
+
+ ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
+ extra_blocks);
+ if (ret) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb,
+ ocfs2_remove_extent_credits(osb->sb) + credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_et_root_journal_access(handle, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ dquot_free_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(inode->i_sb, len));
+
+ ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ocfs2_et_update_clusters(et, -len);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+
+ ocfs2_journal_dirty(handle, et->et_root_bh);
+
+ if (phys_blkno) {
+ if (flags & OCFS2_EXT_REFCOUNTED)
+ ret = ocfs2_decrease_refcount(inode, handle,
+ ocfs2_blocks_to_clusters(osb->sb,
+ phys_blkno),
+ len, meta_ac,
+ dealloc, 1);
+ else
+ ret = ocfs2_truncate_log_append(osb, handle,
+ phys_blkno, len);
+ if (ret)
+ mlog_errno(ret);
+
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ mutex_unlock(&tl_inode->i_mutex);
+bail:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+ return ret;
+}
+
+int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
{
struct buffer_head *tl_bh = osb->osb_tl_bh;
struct ocfs2_dinode *di;
@@ -950,10 +5794,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
return current_tail == new_start;
}
-static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- u64 start_blk,
- unsigned int num_clusters)
+int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+ handle_t *handle,
+ u64 start_blk,
+ unsigned int num_clusters)
{
int status, index;
unsigned int start_cluster, tl_count;
@@ -962,21 +5806,18 @@ static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
- mlog_entry("start_blk = %llu, num_clusters = %u\n",
- (unsigned long long)start_blk, num_clusters);
-
BUG_ON(mutex_trylock(&tl_inode->i_mutex));
start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
di = (struct ocfs2_dinode *) tl_bh->b_data;
- tl = &di->id2.i_dealloc;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
- status = -EIO;
- goto bail;
- }
+ /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
+ * by the underlying call to ocfs2_read_inode_block(), so any
+ * corruption is a code bug */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+ tl = &di->id2.i_dealloc;
tl_count = le16_to_cpu(tl->tl_count);
mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
tl_count == 0,
@@ -994,17 +5835,16 @@ static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_journal_access(handle, tl_inode, tl_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- mlog(0, "Log truncate of %u clusters starting at cluster %u to "
- "%llu (index = %d)\n", num_clusters, start_cluster,
- (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
-
+ trace_ocfs2_truncate_log_append(
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
+ start_cluster, num_clusters);
if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
/*
* Move index back to the record we are coalescing with.
@@ -1013,28 +5853,25 @@ static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
index--;
num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
- mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
- index, le32_to_cpu(tl->tl_recs[index].t_start),
- num_clusters);
+ trace_ocfs2_truncate_log_append(
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+ index, le32_to_cpu(tl->tl_recs[index].t_start),
+ num_clusters);
} else {
tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
tl->tl_used = cpu_to_le16(index + 1);
}
tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
- status = ocfs2_journal_dirty(handle, tl_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, tl_bh);
+ osb->truncated_clusters += num_clusters;
bail:
- mlog_exit(status);
return status;
}
static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+ handle_t *handle,
struct inode *data_alloc_inode,
struct buffer_head *data_alloc_bh)
{
@@ -1048,16 +5885,14 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
struct inode *tl_inode = osb->osb_tl_inode;
struct buffer_head *tl_bh = osb->osb_tl_bh;
- mlog_entry_void();
-
di = (struct ocfs2_dinode *) tl_bh->b_data;
tl = &di->id2.i_dealloc;
i = le16_to_cpu(tl->tl_used) - 1;
while (i >= 0) {
/* Caller has given us at least enough credits to
* update the truncate log dinode */
- status = ocfs2_journal_access(handle, tl_inode, tl_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1065,11 +5900,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
tl->tl_used = cpu_to_le16(i);
- status = ocfs2_journal_dirty(handle, tl_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, tl_bh);
/* TODO: Perhaps we can calculate the bulk of the
* credits up front rather than extending like
@@ -1089,8 +5920,9 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
/* if start_blk is not set, we ignore the record as
* invalid. */
if (start_blk) {
- mlog(0, "free record %d, start = %u, clusters = %u\n",
- i, le32_to_cpu(rec.t_start), num_clusters);
+ trace_ocfs2_replay_truncate_records(
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+ i, le32_to_cpu(rec.t_start), num_clusters);
status = ocfs2_free_clusters(handle, data_alloc_inode,
data_alloc_bh, start_blk,
@@ -1103,17 +5935,18 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
i--;
}
+ osb->truncated_clusters = 0;
+
bail:
- mlog_exit(status);
return status;
}
/* Expects you to already be holding tl_inode->i_mutex */
-static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
int status;
unsigned int num_to_flush;
- struct ocfs2_journal_handle *handle = NULL;
+ handle_t *handle;
struct inode *tl_inode = osb->osb_tl_inode;
struct inode *data_alloc_inode = NULL;
struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -1121,31 +5954,23 @@ static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
- mlog_entry_void();
-
BUG_ON(mutex_trylock(&tl_inode->i_mutex));
di = (struct ocfs2_dinode *) tl_bh->b_data;
- tl = &di->id2.i_dealloc;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
- status = -EIO;
- goto bail;
- }
+ /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
+ * by the underlying call to ocfs2_read_inode_block(), so any
+ * corruption is a code bug */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+ tl = &di->id2.i_dealloc;
num_to_flush = le16_to_cpu(tl->tl_used);
- mlog(0, "Flush %u records from truncate log #%llu\n",
- num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
+ trace_ocfs2_flush_truncate_log(
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+ num_to_flush);
if (!num_to_flush) {
status = 0;
- goto bail;
- }
-
- handle = ocfs2_alloc_handle(osb);
- if (!handle) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
+ goto out;
}
data_alloc_inode = ocfs2_get_system_file_inode(osb,
@@ -1154,42 +5979,40 @@ static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
if (!data_alloc_inode) {
status = -EINVAL;
mlog(ML_ERROR, "Could not get bitmap inode!\n");
- goto bail;
+ goto out;
}
- ocfs2_handle_add_inode(handle, data_alloc_inode);
- status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
+ mutex_lock(&data_alloc_inode->i_mutex);
+
+ status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto out_mutex;
}
- handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE);
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
- handle = NULL;
mlog_errno(status);
- goto bail;
+ goto out_unlock;
}
status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
data_alloc_bh);
- if (status < 0) {
+ if (status < 0)
mlog_errno(status);
- goto bail;
- }
-bail:
- if (handle)
- ocfs2_commit_trans(handle);
+ ocfs2_commit_trans(osb, handle);
- if (data_alloc_inode)
- iput(data_alloc_inode);
+out_unlock:
+ brelse(data_alloc_bh);
+ ocfs2_inode_unlock(data_alloc_inode, 1);
- if (data_alloc_bh)
- brelse(data_alloc_bh);
+out_mutex:
+ mutex_unlock(&data_alloc_inode->i_mutex);
+ iput(data_alloc_inode);
- mlog_exit(status);
+out:
return status;
}
@@ -1205,25 +6028,26 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
return status;
}
-static void ocfs2_truncate_log_worker(void *data)
+static void ocfs2_truncate_log_worker(struct work_struct *work)
{
int status;
- struct ocfs2_super *osb = data;
-
- mlog_entry_void();
+ struct ocfs2_super *osb =
+ container_of(work, struct ocfs2_super,
+ osb_truncate_log_wq.work);
status = ocfs2_flush_truncate_log(osb);
if (status < 0)
mlog_errno(status);
-
- mlog_exit(status);
+ else
+ ocfs2_init_steal_slots(osb);
}
#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
int cancel)
{
- if (osb->osb_tl_inode) {
+ if (osb->osb_tl_inode &&
+ atomic_read(&osb->osb_tl_disable) == 0) {
/* We want to push off log flushes while truncates are
* still running. */
if (cancel)
@@ -1252,8 +6076,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
- OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_inode_block(inode, &bh);
if (status < 0) {
iput(inode);
mlog_errno(status);
@@ -1263,7 +6086,6 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
*tl_inode = inode;
*tl_bh = bh;
bail:
- mlog_exit(status);
return status;
}
@@ -1283,7 +6105,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
*tl_copy = NULL;
- mlog(0, "recover truncate log from slot %d\n", slot_num);
+ trace_ocfs2_begin_truncate_log_recovery(slot_num);
status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
if (status < 0) {
@@ -1292,16 +6114,15 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
}
di = (struct ocfs2_dinode *) tl_bh->b_data;
- tl = &di->id2.i_dealloc;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
- status = -EIO;
- goto bail;
- }
+ /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's
+ * validated by the underlying call to ocfs2_read_inode_block(),
+ * so any corruption is a code bug */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+ tl = &di->id2.i_dealloc;
if (le16_to_cpu(tl->tl_used)) {
- mlog(0, "We'll have %u logs to recover\n",
- le16_to_cpu(tl->tl_used));
+ trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
if (!(*tl_copy)) {
@@ -1318,7 +6139,8 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
* tl_used. */
tl->tl_used = 0;
- status = ocfs2_write_block(osb, tl_bh, tl_inode);
+ ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
+ status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1328,15 +6150,14 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
bail:
if (tl_inode)
iput(tl_inode);
- if (tl_bh)
- brelse(tl_bh);
+ brelse(tl_bh);
if (status < 0 && (*tl_copy)) {
kfree(*tl_copy);
*tl_copy = NULL;
+ mlog_errno(status);
}
- mlog_exit(status);
return status;
}
@@ -1347,12 +6168,10 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
int i;
unsigned int clusters, num_recs, start_cluster;
u64 start_blk;
- struct ocfs2_journal_handle *handle;
+ handle_t *handle;
struct inode *tl_inode = osb->osb_tl_inode;
struct ocfs2_truncate_log *tl;
- mlog_entry_void();
-
if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
return -EINVAL;
@@ -1360,8 +6179,9 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
tl = &tl_copy->id2.i_dealloc;
num_recs = le16_to_cpu(tl->tl_used);
- mlog(0, "cleanup %u records from %llu\n", num_recs,
- (unsigned long long)tl_copy->i_blkno);
+ trace_ocfs2_complete_truncate_log_recovery(
+ (unsigned long long)le64_to_cpu(tl_copy->i_blkno),
+ num_recs);
mutex_lock(&tl_inode->i_mutex);
for(i = 0; i < num_recs; i++) {
@@ -1373,8 +6193,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
}
}
- handle = ocfs2_start_trans(osb, NULL,
- OCFS2_TRUNCATE_LOG_UPDATE);
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
@@ -1387,7 +6206,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
status = ocfs2_truncate_log_append(osb, handle,
start_blk, clusters);
- ocfs2_commit_trans(handle);
+ ocfs2_commit_trans(osb, handle);
if (status < 0) {
mlog_errno(status);
goto bail_up;
@@ -1397,7 +6216,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
bail_up:
mutex_unlock(&tl_inode->i_mutex);
- mlog_exit(status);
return status;
}
@@ -1406,7 +6224,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
int status;
struct inode *tl_inode = osb->osb_tl_inode;
- mlog_entry_void();
+ atomic_set(&osb->osb_tl_disable, 1);
if (tl_inode) {
cancel_delayed_work(&osb->osb_truncate_log_wq);
@@ -1419,8 +6237,6 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
brelse(osb->osb_tl_bh);
iput(osb->osb_tl_inode);
}
-
- mlog_exit_void();
}
int ocfs2_truncate_log_init(struct ocfs2_super *osb)
@@ -1429,8 +6245,6 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
struct inode *tl_inode = NULL;
struct buffer_head *tl_bh = NULL;
- mlog_entry_void();
-
status = ocfs2_get_truncate_log_info(osb,
osb->slot_num,
&tl_inode,
@@ -1441,327 +6255,740 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
/* ocfs2_truncate_log_shutdown keys on the existence of
* osb->osb_tl_inode so we don't set any of the osb variables
* until we're sure all is well. */
- INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb);
+ INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
+ ocfs2_truncate_log_worker);
+ atomic_set(&osb->osb_tl_disable, 0);
osb->osb_tl_bh = tl_bh;
osb->osb_tl_inode = tl_inode;
- mlog_exit(status);
return status;
}
-/* This function will figure out whether the currently last extent
- * block will be deleted, and if it will, what the new last extent
- * block will be so we can update his h_next_leaf_blk field, as well
- * as the dinodes i_last_eb_blk */
-static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
- struct inode *inode,
- struct ocfs2_dinode *fe,
- u32 new_i_clusters,
- struct buffer_head *old_last_eb,
- struct buffer_head **new_last_eb)
+/*
+ * Delayed de-allocation of suballocator blocks.
+ *
+ * Some sets of block de-allocations might involve multiple suballocator inodes.
+ *
+ * The locking for this can get extremely complicated, especially when
+ * the suballocator inodes to delete from aren't known until deep
+ * within an unrelated codepath.
+ *
+ * ocfs2_extent_block structures are a good example of this - an inode
+ * btree could have been grown by any number of nodes each allocating
+ * out of their own suballoc inode.
+ *
+ * These structures allow the delay of block de-allocation until a
+ * later time, when locking of multiple cluster inodes won't cause
+ * deadlock.
+ */
+
+/*
+ * Describe a single bit freed from a suballocator. For the block
+ * suballocators, it represents one block. For the global cluster
+ * allocator, it represents some clusters and free_bit indicates
+ * clusters number.
+ */
+struct ocfs2_cached_block_free {
+ struct ocfs2_cached_block_free *free_next;
+ u64 free_bg;
+ u64 free_blk;
+ unsigned int free_bit;
+};
+
+struct ocfs2_per_slot_free_list {
+ struct ocfs2_per_slot_free_list *f_next_suballocator;
+ int f_inode_type;
+ int f_slot;
+ struct ocfs2_cached_block_free *f_first;
+};
+
+static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
+ int sysfile_type,
+ int slot,
+ struct ocfs2_cached_block_free *head)
{
- int i, status = 0;
- u64 block = 0;
- struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *el;
- struct buffer_head *bh = NULL;
+ int ret;
+ u64 bg_blkno;
+ handle_t *handle;
+ struct inode *inode;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_cached_block_free *tmp;
+
+ inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
+ if (!inode) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
- *new_last_eb = NULL;
+ mutex_lock(&inode->i_mutex);
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- status = -EIO;
- goto bail;
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_mutex;
}
- /* we have no tree, so of course, no last_eb. */
- if (!fe->id2.i_list.l_tree_depth)
- goto bail;
+ handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock;
+ }
- /* trunc to zero special case - this makes tree_depth = 0
- * regardless of what it is. */
- if (!new_i_clusters)
- goto bail;
+ while (head) {
+ if (head->free_bg)
+ bg_blkno = head->free_bg;
+ else
+ bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
+ head->free_bit);
+ trace_ocfs2_free_cached_blocks(
+ (unsigned long long)head->free_blk, head->free_bit);
+
+ ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
+ head->free_bit, bg_blkno, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_journal;
+ }
+
+ ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_journal;
+ }
- eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
- el = &(eb->h_list);
- BUG_ON(!el->l_next_free_rec);
+ tmp = head;
+ head = head->free_next;
+ kfree(tmp);
+ }
- /* Make sure that this guy will actually be empty after we
- * clear away the data. */
- if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
- goto bail;
+out_journal:
+ ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ iput(inode);
+out:
+ while(head) {
+ /* Premature exit may have left some dangling items. */
+ tmp = head;
+ head = head->free_next;
+ kfree(tmp);
+ }
- /* Ok, at this point, we know that last_eb will definitely
- * change, so lets traverse the tree and find the second to
- * last extent block. */
- el = &(fe->id2.i_list);
- /* go down the tree, */
- do {
- for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
- if (le32_to_cpu(el->l_recs[i].e_cpos) <
- new_i_clusters) {
- block = le64_to_cpu(el->l_recs[i].e_blkno);
+ return ret;
+}
+
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ u64 blkno, unsigned int bit)
+{
+ int ret = 0;
+ struct ocfs2_cached_block_free *item;
+
+ item = kzalloc(sizeof(*item), GFP_NOFS);
+ if (item == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ return ret;
+ }
+
+ trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit);
+
+ item->free_blk = blkno;
+ item->free_bit = bit;
+ item->free_next = ctxt->c_global_allocator;
+
+ ctxt->c_global_allocator = item;
+ return ret;
+}
+
+static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
+ struct ocfs2_cached_block_free *head)
+{
+ struct ocfs2_cached_block_free *tmp;
+ struct inode *tl_inode = osb->osb_tl_inode;
+ handle_t *handle;
+ int ret = 0;
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ while (head) {
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mlog_errno(ret);
break;
}
}
- BUG_ON(i < 0);
- if (bh) {
- brelse(bh);
- bh = NULL;
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ break;
}
- status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
- inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
+ head->free_bit);
+
+ ocfs2_commit_trans(osb, handle);
+ tmp = head;
+ head = head->free_next;
+ kfree(tmp);
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
}
- eb = (struct ocfs2_extent_block *) bh->b_data;
- el = &eb->h_list;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
+ }
+
+ mutex_unlock(&tl_inode->i_mutex);
+
+ while (head) {
+ /* Premature exit may have left some dangling items. */
+ tmp = head;
+ head = head->free_next;
+ kfree(tmp);
+ }
+
+ return ret;
+}
+
+int ocfs2_run_deallocs(struct ocfs2_super *osb,
+ struct ocfs2_cached_dealloc_ctxt *ctxt)
+{
+ int ret = 0, ret2;
+ struct ocfs2_per_slot_free_list *fl;
+
+ if (!ctxt)
+ return 0;
+
+ while (ctxt->c_first_suballocator) {
+ fl = ctxt->c_first_suballocator;
+
+ if (fl->f_first) {
+ trace_ocfs2_run_deallocs(fl->f_inode_type,
+ fl->f_slot);
+ ret2 = ocfs2_free_cached_blocks(osb,
+ fl->f_inode_type,
+ fl->f_slot,
+ fl->f_first);
+ if (ret2)
+ mlog_errno(ret2);
+ if (!ret)
+ ret = ret2;
}
- } while (el->l_tree_depth);
- *new_last_eb = bh;
- get_bh(*new_last_eb);
- mlog(0, "returning block %llu\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno));
-bail:
- if (bh)
- brelse(bh);
+ ctxt->c_first_suballocator = fl->f_next_suballocator;
+ kfree(fl);
+ }
- return status;
+ if (ctxt->c_global_allocator) {
+ ret2 = ocfs2_free_cached_clusters(osb,
+ ctxt->c_global_allocator);
+ if (ret2)
+ mlog_errno(ret2);
+ if (!ret)
+ ret = ret2;
+
+ ctxt->c_global_allocator = NULL;
+ }
+
+ return ret;
}
-static int ocfs2_do_truncate(struct ocfs2_super *osb,
- unsigned int clusters_to_del,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct buffer_head *old_last_eb_bh,
- struct ocfs2_journal_handle *handle,
- struct ocfs2_truncate_context *tc)
+static struct ocfs2_per_slot_free_list *
+ocfs2_find_per_slot_free_list(int type,
+ int slot,
+ struct ocfs2_cached_dealloc_ctxt *ctxt)
{
- int status, i, depth;
- struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *eb;
- struct ocfs2_extent_block *last_eb = NULL;
- struct ocfs2_extent_list *el;
- struct buffer_head *eb_bh = NULL;
- struct buffer_head *last_eb_bh = NULL;
- u64 next_eb = 0;
- u64 delete_blk = 0;
-
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
- status = ocfs2_find_new_last_ext_blk(osb,
- inode,
- fe,
- le32_to_cpu(fe->i_clusters) -
- clusters_to_del,
- old_last_eb_bh,
- &last_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
+
+ while (fl) {
+ if (fl->f_inode_type == type && fl->f_slot == slot)
+ return fl;
+
+ fl = fl->f_next_suballocator;
}
- if (last_eb_bh)
- last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ fl = kmalloc(sizeof(*fl), GFP_NOFS);
+ if (fl) {
+ fl->f_inode_type = type;
+ fl->f_slot = slot;
+ fl->f_first = NULL;
+ fl->f_next_suballocator = ctxt->c_first_suballocator;
+
+ ctxt->c_first_suballocator = fl;
}
- el = &(fe->id2.i_list);
+ return fl;
+}
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
- clusters_to_del;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- le32_add_cpu(&fe->i_clusters, -clusters_to_del);
- fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
- fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
+int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ int type, int slot, u64 suballoc,
+ u64 blkno, unsigned int bit)
+{
+ int ret;
+ struct ocfs2_per_slot_free_list *fl;
+ struct ocfs2_cached_block_free *item;
+
+ fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
+ if (fl == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
- i = le16_to_cpu(el->l_next_free_rec) - 1;
+ item = kzalloc(sizeof(*item), GFP_NOFS);
+ if (item == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
- BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
- le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
- /* tree depth zero, we can just delete the clusters, otherwise
- * we need to record the offset of the next level extent block
- * as we may overwrite it. */
- if (!el->l_tree_depth)
- delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
- + ocfs2_clusters_to_blocks(osb->sb,
- le32_to_cpu(el->l_recs[i].e_clusters));
- else
- next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
-
- if (!el->l_recs[i].e_clusters) {
- /* if we deleted the whole extent record, then clear
- * out the other fields and update the extent
- * list. For depth > 0 trees, we've already recorded
- * the extent block in 'next_eb' */
- el->l_recs[i].e_cpos = 0;
- el->l_recs[i].e_blkno = 0;
- BUG_ON(!el->l_next_free_rec);
- le16_add_cpu(&el->l_next_free_rec, -1);
- }
-
- depth = le16_to_cpu(el->l_tree_depth);
- if (!fe->i_clusters) {
- /* trunc to zero is a special case. */
- el->l_tree_depth = 0;
- fe->i_last_eb_blk = 0;
- } else if (last_eb)
- fe->i_last_eb_blk = last_eb->h_blkno;
+ trace_ocfs2_cache_block_dealloc(type, slot,
+ (unsigned long long)suballoc,
+ (unsigned long long)blkno, bit);
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ item->free_bg = suballoc;
+ item->free_blk = blkno;
+ item->free_bit = bit;
+ item->free_next = fl->f_first;
+
+ fl->f_first = item;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ struct ocfs2_extent_block *eb)
+{
+ return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
+ le16_to_cpu(eb->h_suballoc_slot),
+ le64_to_cpu(eb->h_suballoc_loc),
+ le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(eb->h_suballoc_bit));
+}
+
+static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ return 0;
+}
+
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+ unsigned int from, unsigned int to,
+ struct page *page, int zero, u64 *phys)
+{
+ int ret, partial = 0;
+
+ ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
+ if (ret)
+ mlog_errno(ret);
+
+ if (zero)
+ zero_user_segment(page, from, to);
+
+ /*
+ * Need to set the buffers we zero'd into uptodate
+ * here if they aren't - ocfs2_map_page_blocks()
+ * might've skipped some
+ */
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, &partial,
+ ocfs2_zero_func);
+ if (ret < 0)
+ mlog_errno(ret);
+ else if (ocfs2_should_order_data(inode)) {
+ ret = ocfs2_jbd2_file_inode(handle, inode);
+ if (ret < 0)
+ mlog_errno(ret);
}
- if (last_eb) {
- /* If there will be a new last extent block, then by
- * definition, there cannot be any leaves to the right of
- * him. */
- status = ocfs2_journal_access(handle, inode, last_eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- last_eb->h_next_leaf_blk = 0;
- status = ocfs2_journal_dirty(handle, last_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ if (!partial)
+ SetPageUptodate(page);
+
+ flush_dcache_page(page);
+}
+
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
+ loff_t end, struct page **pages,
+ int numpages, u64 phys, handle_t *handle)
+{
+ int i;
+ struct page *page;
+ unsigned int from, to = PAGE_CACHE_SIZE;
+ struct super_block *sb = inode->i_sb;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+ if (numpages == 0)
+ goto out;
+
+ to = PAGE_CACHE_SIZE;
+ for(i = 0; i < numpages; i++) {
+ page = pages[i];
+
+ from = start & (PAGE_CACHE_SIZE - 1);
+ if ((end >> PAGE_CACHE_SHIFT) == page->index)
+ to = end & (PAGE_CACHE_SIZE - 1);
+
+ BUG_ON(from > PAGE_CACHE_SIZE);
+ BUG_ON(to > PAGE_CACHE_SIZE);
+
+ ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
+ &phys);
+
+ start = (page->index + 1) << PAGE_CACHE_SHIFT;
+ }
+out:
+ if (pages)
+ ocfs2_unlock_and_free_pages(pages, numpages);
+}
+
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+ struct page **pages, int *num)
+{
+ int numpages, ret = 0;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long index;
+ loff_t last_page_bytes;
+
+ BUG_ON(start > end);
+
+ numpages = 0;
+ last_page_bytes = PAGE_ALIGN(end);
+ index = start >> PAGE_CACHE_SHIFT;
+ do {
+ pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
+ if (!pages[numpages]) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
}
+
+ numpages++;
+ index++;
+ } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
+
+out:
+ if (ret != 0) {
+ if (pages)
+ ocfs2_unlock_and_free_pages(pages, numpages);
+ numpages = 0;
}
- /* if our tree depth > 0, update all the tree blocks below us. */
- while (depth) {
- mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n",
- depth, (unsigned long long)next_eb);
- status = ocfs2_read_block(osb, next_eb, &eb_bh,
- OCFS2_BH_CACHED, inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ *num = numpages;
+
+ return ret;
+}
+
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
+ struct page **pages, int *num)
+{
+ struct super_block *sb = inode->i_sb;
+
+ BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
+ (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
+
+ return ocfs2_grab_pages(inode, start, end, pages, num);
+}
+
+/*
+ * Zero the area past i_size but still within an allocated
+ * cluster. This avoids exposing nonzero data on subsequent file
+ * extends.
+ *
+ * We need to call this before i_size is updated on the inode because
+ * otherwise block_write_full_page() will skip writeout of pages past
+ * i_size. The new_i_size parameter is passed for this reason.
+ */
+int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
+ u64 range_start, u64 range_end)
+{
+ int ret = 0, numpages;
+ struct page **pages = NULL;
+ u64 phys;
+ unsigned int ext_flags;
+ struct super_block *sb = inode->i_sb;
+
+ /*
+ * File systems which don't support sparse files zero on every
+ * extend.
+ */
+ if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
+ return 0;
+
+ pages = kcalloc(ocfs2_pages_per_cluster(sb),
+ sizeof(struct page *), GFP_NOFS);
+ if (pages == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (range_start == range_end)
+ goto out;
+
+ ret = ocfs2_extent_map_get_blocks(inode,
+ range_start >> sb->s_blocksize_bits,
+ &phys, NULL, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Tail is a hole, or is marked unwritten. In either case, we
+ * can count on read and write to return/push zero's.
+ */
+ if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
+ goto out;
+
+ ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
+ &numpages);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
+ numpages, phys, handle);
+
+ /*
+ * Initiate writeout of the pages we zero'd here. We don't
+ * wait on them - the truncate_inode_pages() call later will
+ * do that for us.
+ */
+ ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
+ range_end - 1);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ kfree(pages);
+
+ return ret;
+}
+
+static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
+ struct ocfs2_dinode *di)
+{
+ unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
+ unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+ if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+ memset(&di->id2, 0, blocksize -
+ offsetof(struct ocfs2_dinode, id2) -
+ xattrsize);
+ else
+ memset(&di->id2, 0, blocksize -
+ offsetof(struct ocfs2_dinode, id2));
+}
+
+void ocfs2_dinode_new_extent_list(struct inode *inode,
+ struct ocfs2_dinode *di)
+{
+ ocfs2_zero_dinode_id2_with_xattr(inode, di);
+ di->id2.i_list.l_tree_depth = 0;
+ di->id2.i_list.l_next_free_rec = 0;
+ di->id2.i_list.l_count = cpu_to_le16(
+ ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
+}
+
+void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_inline_data *idata = &di->id2.i_data;
+
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ spin_unlock(&oi->ip_lock);
+
+ /*
+ * We clear the entire i_data structure here so that all
+ * fields can be properly initialized.
+ */
+ ocfs2_zero_dinode_id2_with_xattr(inode, di);
+
+ idata->id_count = cpu_to_le16(
+ ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
+}
+
+int ocfs2_convert_inline_data_to_extents(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+ int ret, i, has_data, num_pages = 0;
+ int need_free = 0;
+ u32 bit_off, num;
+ handle_t *handle;
+ u64 uninitialized_var(block);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct page **pages = NULL;
+ loff_t end = osb->s_clustersize;
+ struct ocfs2_extent_tree et;
+ int did_quota = 0;
+
+ has_data = i_size_read(inode) ? 1 : 0;
+
+ if (has_data) {
+ pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
+ sizeof(struct page *), GFP_NOFS);
+ if (pages == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
}
- eb = (struct ocfs2_extent_block *)eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
+
+ ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
- el = &(eb->h_list);
+ }
- status = ocfs2_journal_access(handle, inode, eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ handle = ocfs2_start_trans(osb,
+ ocfs2_inline_to_extents_credits(osb->sb));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ if (has_data) {
+ unsigned int page_end;
+ u64 phys;
+
+ ret = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (ret)
+ goto out_commit;
+ did_quota = 1;
+
+ data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
+ ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
+ &num);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
}
- BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
- BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
+ /*
+ * Save two copies, one for insert, and one that can
+ * be changed by ocfs2_map_and_dirty_page() below.
+ */
+ block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
- i = le16_to_cpu(el->l_next_free_rec) - 1;
+ /*
+ * Non sparse file systems zero on extend, so no need
+ * to do that now.
+ */
+ if (!ocfs2_sparse_alloc(osb) &&
+ PAGE_CACHE_SIZE < osb->s_clustersize)
+ end = PAGE_CACHE_SIZE;
+
+ ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
+ if (ret) {
+ mlog_errno(ret);
+ need_free = 1;
+ goto out_commit;
+ }
- mlog(0, "extent block %llu, before: record %d: "
- "(%u, %u, %llu), next = %u\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno), i,
- le32_to_cpu(el->l_recs[i].e_cpos),
- le32_to_cpu(el->l_recs[i].e_clusters),
- (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
- le16_to_cpu(el->l_next_free_rec));
-
- BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
- le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
-
- next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
- /* bottom-most block requires us to delete data.*/
- if (!el->l_tree_depth)
- delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
- + ocfs2_clusters_to_blocks(osb->sb,
- le32_to_cpu(el->l_recs[i].e_clusters));
- if (!el->l_recs[i].e_clusters) {
- el->l_recs[i].e_cpos = 0;
- el->l_recs[i].e_blkno = 0;
- BUG_ON(!el->l_next_free_rec);
- le16_add_cpu(&el->l_next_free_rec, -1);
- }
- mlog(0, "extent block %llu, after: record %d: "
- "(%u, %u, %llu), next = %u\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno), i,
- le32_to_cpu(el->l_recs[i].e_cpos),
- le32_to_cpu(el->l_recs[i].e_clusters),
- (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
- le16_to_cpu(el->l_next_free_rec));
-
- status = ocfs2_journal_dirty(handle, eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ /*
+ * This should populate the 1st page for us and mark
+ * it up to date.
+ */
+ ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ need_free = 1;
+ goto out_commit;
}
- if (!el->l_next_free_rec) {
- mlog(0, "deleting this extent block.\n");
+ page_end = PAGE_CACHE_SIZE;
+ if (PAGE_CACHE_SIZE > osb->s_clustersize)
+ page_end = osb->s_clustersize;
- ocfs2_remove_from_cache(inode, eb_bh);
+ for (i = 0; i < num_pages; i++)
+ ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
+ pages[i], i > 0, &phys);
+ }
- BUG_ON(el->l_recs[0].e_clusters);
- BUG_ON(el->l_recs[0].e_cpos);
- BUG_ON(el->l_recs[0].e_blkno);
- if (eb->h_suballoc_slot == 0) {
- /*
- * This code only understands how to
- * lock the suballocator in slot 0,
- * which is fine because allocation is
- * only ever done out of that
- * suballocator too. A future version
- * might change that however, so avoid
- * a free if we don't know how to
- * handle it. This way an fs incompat
- * bit will not be necessary.
- */
- status = ocfs2_free_extent_block(handle,
- tc->tc_ext_alloc_inode,
- tc->tc_ext_alloc_bh,
- eb);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ spin_unlock(&oi->ip_lock);
+
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ ocfs2_dinode_new_extent_list(inode, di);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+ if (has_data) {
+ /*
+ * An error at this point should be extremely rare. If
+ * this proves to be false, we could always re-build
+ * the in-inode data from our pages.
+ */
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+ ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ need_free = 1;
+ goto out_commit;
}
- brelse(eb_bh);
- eb_bh = NULL;
- depth--;
+
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
}
- BUG_ON(!delete_blk);
- status = ocfs2_truncate_log_append(osb, handle, delete_blk,
- clusters_to_del);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+out_commit:
+ if (ret < 0 && did_quota)
+ dquot_free_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+
+ if (need_free) {
+ if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ bit_off, num);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, bit_off),
+ num);
}
- status = 0;
-bail:
- if (!status)
- ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
- else
- ocfs2_extent_map_drop(inode, 0);
- mlog_exit(status);
- return status;
+
+ ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+
+out:
+ if (pages) {
+ ocfs2_unlock_and_free_pages(pages, num_pages);
+ kfree(pages);
+ }
+
+ return ret;
}
/*
@@ -1772,283 +6999,388 @@ bail:
*/
int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context *tc)
+ struct buffer_head *di_bh)
{
- int status, i, credits, tl_sem = 0;
- u32 clusters_to_del, target_i_clusters;
- u64 last_eb = 0;
- struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *eb;
+ int status = 0, i, flags = 0;
+ u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
+ u64 blkno = 0;
struct ocfs2_extent_list *el;
- struct buffer_head *last_eb_bh;
- struct ocfs2_journal_handle *handle = NULL;
- struct inode *tl_inode = osb->osb_tl_inode;
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_extent_list *root_el = &(di->id2.i_list);
+ u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
+ struct ocfs2_extent_tree et;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
+ i_size_read(inode));
- mlog_entry_void();
+ path = ocfs2_new_path(di_bh, &di->id2.i_list,
+ ocfs2_journal_access_di);
+ if (!path) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ ocfs2_extent_map_trunc(inode, new_highest_cpos);
- target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
- i_size_read(inode));
+start:
+ /*
+ * Check that we still have allocation to delete.
+ */
+ if (OCFS2_I(inode)->ip_clusters == 0) {
+ status = 0;
+ goto bail;
+ }
- last_eb_bh = tc->tc_last_eb_bh;
- tc->tc_last_eb_bh = NULL;
+ /*
+ * Truncate always works against the rightmost tree branch.
+ */
+ status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
+ trace_ocfs2_commit_truncate(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ new_highest_cpos,
+ OCFS2_I(inode)->ip_clusters,
+ path->p_tree_depth);
- if (fe->id2.i_list.l_tree_depth) {
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- el = &eb->h_list;
- } else
- el = &fe->id2.i_list;
- last_eb = le64_to_cpu(fe->i_last_eb_blk);
-start:
- mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
- "last_eb = %llu, fe->i_last_eb_blk = %llu, "
- "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
- le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb,
- (unsigned long long)le64_to_cpu(fe->i_last_eb_blk),
- le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
-
- if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
- mlog(0, "last_eb changed!\n");
- BUG_ON(!fe->id2.i_list.l_tree_depth);
- last_eb = le64_to_cpu(fe->i_last_eb_blk);
- /* i_last_eb_blk may have changed, read it if
- * necessary. We don't have to worry about the
- * truncate to zero case here (where there becomes no
- * last_eb) because we never loop back after our work
- * is done. */
- if (last_eb_bh) {
- brelse(last_eb_bh);
- last_eb_bh = NULL;
- }
-
- status = ocfs2_read_block(osb, last_eb,
- &last_eb_bh, OCFS2_BH_CACHED,
- inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
- }
- el = &(eb->h_list);
+ /*
+ * By now, el will point to the extent list on the bottom most
+ * portion of this tree. Only the tail record is considered in
+ * each pass.
+ *
+ * We handle the following cases, in order:
+ * - empty extent: delete the remaining branch
+ * - remove the entire record
+ * - remove a partial record
+ * - no record needs to be removed (truncate has completed)
+ */
+ el = path_leaf_el(path);
+ if (le16_to_cpu(el->l_next_free_rec) == 0) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has empty extent block at %llu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)path_leaf_bh(path)->b_blocknr);
+ status = -EROFS;
+ goto bail;
}
- /* by now, el will point to the extent list on the bottom most
- * portion of this tree. */
i = le16_to_cpu(el->l_next_free_rec) - 1;
- if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
- clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
- else
- clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
- le32_to_cpu(el->l_recs[i].e_cpos)) -
- target_i_clusters;
-
- mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
+ rec = &el->l_recs[i];
+ flags = rec->e_flags;
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
- mutex_lock(&tl_inode->i_mutex);
- tl_sem = 1;
- /* ocfs2_truncate_log_needs_flush guarantees us at least one
- * record is free for use. If there isn't any, we flush to get
- * an empty truncate log. */
- if (ocfs2_truncate_log_needs_flush(osb)) {
- status = __ocfs2_flush_truncate_log(osb);
- if (status < 0) {
- mlog_errno(status);
+ if (i == 0 && ocfs2_is_empty_extent(rec)) {
+ /*
+ * Lower levels depend on this never happening, but it's best
+ * to check it up here before changing the tree.
+ */
+ if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
+ ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+ "extent record, depth %u\n", inode->i_ino,
+ le16_to_cpu(root_el->l_tree_depth));
+ status = -EROFS;
goto bail;
}
- }
-
- credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
- fe, el);
- handle = ocfs2_start_trans(osb, NULL, credits);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- handle = NULL;
- mlog_errno(status);
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = 0;
+ blkno = 0;
+ } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
+ /*
+ * Truncate entire record.
+ */
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = ocfs2_rec_clusters(el, rec);
+ blkno = le64_to_cpu(rec->e_blkno);
+ } else if (range > new_highest_cpos) {
+ /*
+ * Partial truncate. it also should be
+ * the last truncate we're doing.
+ */
+ trunc_cpos = new_highest_cpos;
+ trunc_len = range - new_highest_cpos;
+ coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
+ blkno = le64_to_cpu(rec->e_blkno) +
+ ocfs2_clusters_to_blocks(inode->i_sb, coff);
+ } else {
+ /*
+ * Truncate completed, leave happily.
+ */
+ status = 0;
goto bail;
}
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
- status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
- if (status < 0)
- mlog_errno(status);
+ phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
- status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
- last_eb_bh, handle, tc);
+ status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+ phys_cpos, trunc_len, flags, &dealloc,
+ refcount_loc);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- mutex_unlock(&tl_inode->i_mutex);
- tl_sem = 0;
+ ocfs2_reinit_path(path, 1);
- ocfs2_commit_trans(handle);
- handle = NULL;
+ /*
+ * The check above will catch the case where we've truncated
+ * away all allocation.
+ */
+ goto start;
- BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
- if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
- goto start;
bail:
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_schedule_truncate_log_flush(osb, 1);
- if (tl_sem)
- mutex_unlock(&tl_inode->i_mutex);
+ ocfs2_run_deallocs(osb, &dealloc);
- if (handle)
- ocfs2_commit_trans(handle);
+ ocfs2_free_path(path);
- if (last_eb_bh)
- brelse(last_eb_bh);
-
- /* This will drop the ext_alloc cluster lock for us */
- ocfs2_free_truncate_context(tc);
-
- mlog_exit(status);
return status;
}
-
/*
- * Expects the inode to already be locked. This will figure out which
- * inodes need to be locked and will put them on the returned truncate
- * context.
+ * 'start' is inclusive, 'end' is not.
*/
-int ocfs2_prepare_truncate(struct ocfs2_super *osb,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context **tc)
-{
- int status, metadata_delete;
- unsigned int new_i_clusters;
- struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *el;
- struct buffer_head *last_eb_bh = NULL;
- struct inode *ext_alloc_inode = NULL;
- struct buffer_head *ext_alloc_bh = NULL;
-
- mlog_entry_void();
-
- *tc = NULL;
-
- new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
- i_size_read(inode));
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
- mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
- "%llu\n", fe->i_clusters, new_i_clusters,
- (unsigned long long)fe->i_size);
-
- if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
- ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
- "%u and size %llu whereas struct inode has "
- "cluster count %u and size %llu which caused an "
- "invalid truncate to %u clusters.",
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- le32_to_cpu(fe->i_clusters),
- (unsigned long long)le64_to_cpu(fe->i_size),
- OCFS2_I(inode)->ip_clusters, i_size_read(inode),
- new_i_clusters);
- mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
- status = -EIO;
- goto bail;
+int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
+ unsigned int start, unsigned int end, int trunc)
+{
+ int ret;
+ unsigned int numbytes;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_inline_data *idata = &di->id2.i_data;
+
+ if (end > i_size_read(inode))
+ end = i_size_read(inode);
+
+ BUG_ON(start > end);
+
+ if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
+ !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
+ !ocfs2_supports_inline_data(osb)) {
+ ocfs2_error(inode->i_sb,
+ "Inline data flags for inode %llu don't agree! "
+ "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ le16_to_cpu(di->i_dyn_features),
+ OCFS2_I(inode)->ip_dyn_features,
+ osb->s_feature_incompat);
+ ret = -EROFS;
+ goto out;
}
- *tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
- if (!(*tc)) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
}
- metadata_delete = 0;
- if (fe->id2.i_list.l_tree_depth) {
- /* If we have a tree, then the truncate may result in
- * metadata deletes. Figure this out from the
- * rightmost leaf block.*/
- status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
- &last_eb_bh, OCFS2_BH_CACHED, inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
- brelse(last_eb_bh);
- status = -EIO;
- goto bail;
- }
- el = &(eb->h_list);
- if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
- metadata_delete = 1;
+ numbytes = end - start;
+ memset(idata->id_data + start, 0, numbytes);
+
+ /*
+ * No need to worry about the data page here - it's been
+ * truncated already and inline data doesn't need it for
+ * pushing zero's to disk, so we'll let readpage pick it up
+ * later.
+ */
+ if (trunc) {
+ i_size_write(inode, start);
+ di->i_size = cpu_to_le64(start);
}
- (*tc)->tc_last_eb_bh = last_eb_bh;
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
- if (metadata_delete) {
- mlog(0, "Will have to delete metadata for this trunc. "
- "locking allocator.\n");
- ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
- if (!ext_alloc_inode) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
- mutex_lock(&ext_alloc_inode->i_mutex);
- (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ ocfs2_journal_dirty(handle, di_bh);
- status = ocfs2_meta_lock(ext_alloc_inode,
- NULL,
- &ext_alloc_bh,
- 1);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ return ret;
+}
+
+static int ocfs2_trim_extent(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ u32 start, u32 count)
+{
+ u64 discard, bcount;
+
+ bcount = ocfs2_clusters_to_blocks(sb, count);
+ discard = le64_to_cpu(gd->bg_blkno) +
+ ocfs2_clusters_to_blocks(sb, start);
+
+ trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
+
+ return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
+}
+
+static int ocfs2_trim_group(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ u32 start, u32 max, u32 minbits)
+{
+ int ret = 0, count = 0, next;
+ void *bitmap = gd->bg_bitmap;
+
+ if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
+ return 0;
+
+ trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
+ start, max, minbits);
+
+ while (start < max) {
+ start = ocfs2_find_next_zero_bit(bitmap, max, start);
+ if (start >= max)
+ break;
+ next = ocfs2_find_next_bit(bitmap, max, start);
+
+ if ((next - start) >= minbits) {
+ ret = ocfs2_trim_extent(sb, gd,
+ start, next - start);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ count += next - start;
}
- (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
- (*tc)->tc_ext_alloc_locked = 1;
- }
+ start = next + 1;
- status = 0;
-bail:
- if (status < 0) {
- if (*tc)
- ocfs2_free_truncate_context(*tc);
- *tc = NULL;
+ if (fatal_signal_pending(current)) {
+ count = -ERESTARTSYS;
+ break;
+ }
+
+ if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
+ break;
}
- mlog_exit_void();
- return status;
+
+ if (ret < 0)
+ count = ret;
+
+ return count;
}
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
- if (tc->tc_ext_alloc_inode) {
- if (tc->tc_ext_alloc_locked)
- ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ u64 start, len, trimmed, first_group, last_group, group;
+ int ret, cnt;
+ u32 first_bit, last_bit, minlen;
+ struct buffer_head *main_bm_bh = NULL;
+ struct inode *main_bm_inode = NULL;
+ struct buffer_head *gd_bh = NULL;
+ struct ocfs2_dinode *main_bm;
+ struct ocfs2_group_desc *gd = NULL;
+
+ start = range->start >> osb->s_clustersize_bits;
+ len = range->len >> osb->s_clustersize_bits;
+ minlen = range->minlen >> osb->s_clustersize_bits;
+
+ if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
+ return -EINVAL;
- mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
- iput(tc->tc_ext_alloc_inode);
+ main_bm_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!main_bm_inode) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
}
- if (tc->tc_ext_alloc_bh)
- brelse(tc->tc_ext_alloc_bh);
+ mutex_lock(&main_bm_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+ main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
- if (tc->tc_last_eb_bh)
- brelse(tc->tc_last_eb_bh);
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
- kfree(tc);
+ len = range->len >> osb->s_clustersize_bits;
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
+
+ trace_ocfs2_trim_fs(start, len, minlen);
+
+ /* Determine first and last group to examine based on start and len */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
+ last_bit = osb->bitmap_cpg;
+
+ trimmed = 0;
+ for (group = first_group; group <= last_group;) {
+ if (first_bit + len >= osb->bitmap_cpg)
+ last_bit = osb->bitmap_cpg;
+ else
+ last_bit = first_bit + len;
+
+ ret = ocfs2_read_group_descriptor(main_bm_inode,
+ main_bm, group,
+ &gd_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
+ gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+ cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+ brelse(gd_bh);
+ gd_bh = NULL;
+ if (cnt < 0) {
+ ret = cnt;
+ mlog_errno(ret);
+ break;
+ }
+
+ trimmed += cnt;
+ len -= osb->bitmap_cpg - first_bit;
+ first_bit = 0;
+ if (group == osb->first_cluster_group_blkno)
+ group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ else
+ group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ }
+ range->len = trimmed * sb->s_blocksize;
+out_unlock:
+ ocfs2_inode_unlock(main_bm_inode, 0);
+ brelse(main_bm_bh);
+out_mutex:
+ mutex_unlock(&main_bm_inode->i_mutex);
+ iput(main_bm_inode);
+out:
+ return ret;
}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 12ba897743f..ca381c58412 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -26,19 +26,135 @@
#ifndef OCFS2_ALLOC_H
#define OCFS2_ALLOC_H
+
+/*
+ * For xattr tree leaf, we limit the leaf byte size to be 64K.
+ */
+#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536
+
+/*
+ * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
+ * the b-tree operations in ocfs2. Now all the b-tree operations are not
+ * limited to ocfs2_dinode only. Any data which need to allocate clusters
+ * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree
+ * and operation.
+ *
+ * ocfs2_extent_tree becomes the first-class object for extent tree
+ * manipulation. Callers of the alloc.c code need to fill it via one of
+ * the ocfs2_init_*_extent_tree() operations below.
+ *
+ * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
+ * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
+ * functions. It needs the ocfs2_caching_info structure associated with
+ * I/O on the tree. With metadata ecc, we now call different journal_access
+ * functions for each type of metadata, so it must have the
+ * root_journal_access function.
+ * ocfs2_extent_tree_operations abstract the normal operations we do for
+ * the root of extent b-tree.
+ */
+struct ocfs2_extent_tree_operations;
+struct ocfs2_extent_tree {
+ struct ocfs2_extent_tree_operations *et_ops;
+ struct buffer_head *et_root_bh;
+ struct ocfs2_extent_list *et_root_el;
+ struct ocfs2_caching_info *et_ci;
+ ocfs2_journal_access_func et_root_journal_access;
+ void *et_object;
+ unsigned int et_max_leaf_clusters;
+};
+
+/*
+ * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the
+ * specified object buffer.
+ */
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh);
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh);
+struct ocfs2_xattr_value_buf;
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct ocfs2_xattr_value_buf *vb);
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh);
+void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh);
+
+/*
+ * Read an extent block into *bh. If *bh is NULL, a bh will be
+ * allocated. This is a cached read. The extent block will be validated
+ * with ocfs2_validate_extent_block().
+ */
+int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
+ struct buffer_head **bh);
+
struct ocfs2_alloc_context;
-int ocfs2_insert_extent(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
- u64 blkno,
+int ocfs2_insert_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 cpos,
+ u64 start_blk,
u32 new_clusters,
+ u8 flags,
struct ocfs2_alloc_context *meta_ac);
+
+enum ocfs2_alloc_restarted {
+ RESTART_NONE = 0,
+ RESTART_TRANS,
+ RESTART_META
+};
+int ocfs2_add_clusters_in_btree(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 *logical_offset,
+ u32 clusters_to_add,
+ int mark_unwritten,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason_ret);
+struct ocfs2_cached_dealloc_ctxt;
+struct ocfs2_path;
+int ocfs2_split_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ int split_index,
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_mark_extent_written(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ handle_t *handle, u32 cpos, u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_change_extent_flag(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int new_flags, int clear_flags);
+int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
+ u32 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_btree_range(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 phys_cpos, u32 len, int flags,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ u64 refcount_loc);
+
int ocfs2_num_free_extents(struct ocfs2_super *osb,
- struct inode *inode,
- struct ocfs2_dinode *fe);
-/* how many new metadata chunks would an allocation need at maximum? */
-static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
+ struct ocfs2_extent_tree *et);
+
+/*
+ * how many new metadata chunks would an allocation need at maximum?
+ *
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el)
{
/*
* Rather than do all the work of determining how much we need
@@ -48,9 +164,14 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
* new tree_depth==0 extent_block, and one block at the new
* top-of-the tree.
*/
- return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
+ return le16_to_cpu(root_el->l_tree_depth) + 2;
}
+void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
+void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di);
+int ocfs2_convert_inline_data_to_extents(struct inode *inode,
+ struct buffer_head *di_bh);
+
int ocfs2_truncate_log_init(struct ocfs2_super *osb);
void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
@@ -61,22 +182,143 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
struct ocfs2_dinode **tl_copy);
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
struct ocfs2_dinode *tl_copy);
+int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb);
+int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+ handle_t *handle,
+ u64 start_blk,
+ unsigned int num_clusters);
+int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+
+/*
+ * Process local structure which describes the block unlinks done
+ * during an operation. This is populated via
+ * ocfs2_cache_block_dealloc().
+ *
+ * ocfs2_run_deallocs() should be called after the potentially
+ * de-allocating routines. No journal handles should be open, and most
+ * locks should have been dropped.
+ */
+struct ocfs2_cached_dealloc_ctxt {
+ struct ocfs2_per_slot_free_list *c_first_suballocator;
+ struct ocfs2_cached_block_free *c_global_allocator;
+};
+static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
+{
+ c->c_first_suballocator = NULL;
+ c->c_global_allocator = NULL;
+}
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ u64 blkno, unsigned int bit);
+int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ int type, int slot, u64 suballoc, u64 blkno,
+ unsigned int bit);
+static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
+{
+ return c->c_global_allocator != NULL;
+}
+int ocfs2_run_deallocs(struct ocfs2_super *osb,
+ struct ocfs2_cached_dealloc_ctxt *ctxt);
struct ocfs2_truncate_context {
- struct inode *tc_ext_alloc_inode;
- struct buffer_head *tc_ext_alloc_bh;
+ struct ocfs2_cached_dealloc_ctxt tc_dealloc;
int tc_ext_alloc_locked; /* is it cluster locked? */
/* these get destroyed once it's passed to ocfs2_commit_truncate. */
struct buffer_head *tc_last_eb_bh;
};
-int ocfs2_prepare_truncate(struct ocfs2_super *osb,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context **tc);
+int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
+ u64 range_start, u64 range_end);
int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context *tc);
+ struct buffer_head *di_bh);
+int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
+ unsigned int start, unsigned int end, int trunc);
+
+int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
+ struct ocfs2_extent_list *root_el, u32 cpos,
+ struct buffer_head **leaf_bh);
+int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
+/*
+ * Helper function to look at the # of clusters in an extent record.
+ */
+static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
+ struct ocfs2_extent_rec *rec)
+{
+ /*
+ * Cluster count in extent records is slightly different
+ * between interior nodes and leaf nodes. This is to support
+ * unwritten extents which need a flags field in leaf node
+ * records, thus shrinking the available space for a clusters
+ * field.
+ */
+ if (el->l_tree_depth)
+ return le32_to_cpu(rec->e_int_clusters);
+ else
+ return le16_to_cpu(rec->e_leaf_clusters);
+}
+
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
+static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
+{
+ return !rec->e_leaf_clusters;
+}
+
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+ struct page **pages, int *num);
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+ unsigned int from, unsigned int to,
+ struct page *page, int zero, u64 *phys);
+/*
+ * Structures which describe a path through a btree, and functions to
+ * manipulate them.
+ *
+ * The idea here is to be as generic as possible with the tree
+ * manipulation code.
+ */
+struct ocfs2_path_item {
+ struct buffer_head *bh;
+ struct ocfs2_extent_list *el;
+};
+
+#define OCFS2_MAX_PATH_DEPTH 5
+
+struct ocfs2_path {
+ int p_tree_depth;
+ ocfs2_journal_access_func p_root_access;
+ struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
+};
+
+#define path_root_bh(_path) ((_path)->p_node[0].bh)
+#define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_root_access(_path)((_path)->p_root_access)
+#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
+#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
+#define path_num_items(_path) ((_path)->p_tree_depth + 1)
+void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root);
+void ocfs2_free_path(struct ocfs2_path *path);
+int ocfs2_find_path(struct ocfs2_caching_info *ci,
+ struct ocfs2_path *path,
+ u32 cpos);
+struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path);
+struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et);
+int ocfs2_path_bh_journal_access(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct ocfs2_path *path,
+ int idx);
+int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
+ handle_t *handle,
+ struct ocfs2_path *path);
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left,
+ struct ocfs2_path *right);
#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3d7c082a8f5..4a231a166cf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -24,8 +24,11 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <asm/byteorder.h>
+#include <linux/swap.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mpage.h>
+#include <linux/quotaops.h>
-#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -37,8 +40,11 @@
#include "file.h"
#include "inode.h"
#include "journal.h"
+#include "suballoc.h"
#include "super.h"
#include "symlink.h"
+#include "refcounttree.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -53,8 +59,9 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
void *kaddr;
- mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
- (unsigned long long)iblock, bh_result, create);
+ trace_ocfs2_symlink_get_block(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)iblock, bh_result, create);
BUG_ON(ocfs2_inode_is_fast_symlink(inode));
@@ -64,23 +71,16 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
- status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno,
- &bh, OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_inode_block(inode, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
fe = (struct ocfs2_dinode *) bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
- (unsigned long long)fe->i_blkno, 7, fe->i_signature);
- goto bail;
- }
-
if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(fe->i_clusters))) {
+ err = -ENOMEM;
mlog(ML_ERROR, "block offset is outside the allocated size: "
"%llu\n", (unsigned long long)iblock);
goto bail;
@@ -93,6 +93,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
iblock;
buffer_cache_bh = sb_getblk(osb->sb, blkno);
if (!buffer_cache_bh) {
+ err = -ENOMEM;
mlog(ML_ERROR, "couldn't getblock for symlink!\n");
goto bail;
}
@@ -103,7 +104,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
* copy, the data is still good. */
if (buffer_jbd(buffer_cache_bh)
&& ocfs2_inode_is_new(inode)) {
- kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh_result->b_page);
if (!kaddr) {
mlog(ML_ERROR, "couldn't kmap!\n");
goto bail;
@@ -111,7 +112,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
memcpy(kaddr + (bh_result->b_size * iblock),
buffer_cache_bh->b_data,
bh_result->b_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
set_buffer_uptodate(bh_result);
}
brelse(buffer_cache_bh);
@@ -123,21 +124,22 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
err = 0;
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
- mlog_exit(err);
return err;
}
-static int ocfs2_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
{
int err = 0;
- u64 p_blkno, past_eof;
+ unsigned int ext_flags;
+ u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
+ u64 p_blkno, count, past_eof;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
- (unsigned long long)iblock, bh_result, create);
+ trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)iblock, bh_result, create);
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
@@ -149,17 +151,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
- /* this can happen if another node truncs after our extend! */
- spin_lock(&OCFS2_I(inode)->ip_lock);
- if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
- OCFS2_I(inode)->ip_clusters))
- err = -EIO;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- if (err)
- goto bail;
-
- err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
- NULL);
+ err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
+ &ext_flags);
if (err) {
mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
"%llu, NULL)\n", err, inode, (unsigned long long)iblock,
@@ -167,20 +160,50 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
- map_bh(bh_result, inode->i_sb, p_blkno);
+ if (max_blocks < count)
+ count = max_blocks;
- if (bh_result->b_blocknr == 0) {
- err = -EIO;
- mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
- (unsigned long long)iblock,
- (unsigned long long)p_blkno,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ /*
+ * ocfs2 never allocates in this function - the only time we
+ * need to use BH_New is when we're extending i_size on a file
+ * system which doesn't support holes, in which case BH_New
+ * allows __block_write_begin() to zero.
+ *
+ * If we see this on a sparse file system, then a truncate has
+ * raced us and removed the cluster. In this case, we clear
+ * the buffers dirty and uptodate bits and let the buffer code
+ * ignore it as a hole.
+ */
+ if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
+ clear_buffer_dirty(bh_result);
+ clear_buffer_uptodate(bh_result);
+ goto bail;
+ }
+
+ /* Treat the unwritten extent as a hole for zeroing purposes. */
+ if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+ map_bh(bh_result, inode->i_sb, p_blkno);
+
+ bh_result->b_size = count << inode->i_blkbits;
+
+ if (!ocfs2_sparse_alloc(osb)) {
+ if (p_blkno == 0) {
+ err = -EIO;
+ mlog(ML_ERROR,
+ "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
+ (unsigned long long)iblock,
+ (unsigned long long)p_blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
+ dump_stack();
+ goto bail;
+ }
}
past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
- (unsigned long long)past_eof);
+ trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)past_eof);
if (create && (iblock >= past_eof))
set_buffer_new(bh_result);
@@ -188,19 +211,79 @@ bail:
if (err < 0)
err = -EIO;
- mlog_exit(err);
return err;
}
+int ocfs2_read_inline_data(struct inode *inode, struct page *page,
+ struct buffer_head *di_bh)
+{
+ void *kaddr;
+ loff_t size;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
+ ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ return -EROFS;
+ }
+
+ size = i_size_read(inode);
+
+ if (size > PAGE_CACHE_SIZE ||
+ size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has with inline data has bad size: %Lu",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)size);
+ return -EROFS;
+ }
+
+ kaddr = kmap_atomic(page);
+ if (size)
+ memcpy(kaddr, di->id2.i_data.id_data, size);
+ /* Clear the remaining part of the page */
+ memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr);
+
+ SetPageUptodate(page);
+
+ return 0;
+}
+
+static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
+
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_inline_data(inode, page, di_bh);
+out:
+ unlock_page(page);
+
+ brelse(di_bh);
+ return ret;
+}
+
static int ocfs2_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
int ret, unlock = 1;
- mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
+ trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
+ (page ? page->index : 0));
- ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
+ ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
if (ret != 0) {
if (ret == AOP_TRUNCATED_PAGE)
unlock = 0;
@@ -208,128 +291,139 @@ static int ocfs2_readpage(struct file *file, struct page *page)
goto out;
}
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+ /*
+ * Unlock the page and cycle ip_alloc_sem so that we don't
+ * busyloop waiting for ip_alloc_sem to unlock
+ */
+ ret = AOP_TRUNCATED_PAGE;
+ unlock_page(page);
+ unlock = 0;
+ down_read(&oi->ip_alloc_sem);
+ up_read(&oi->ip_alloc_sem);
+ goto out_inode_unlock;
+ }
/*
* i_size might have just been updated as we grabed the meta lock. We
* might now be discovering a truncate that hit on another node.
* block_read_full_page->get_block freaks out if it is asked to read
* beyond the end of a file, so we check here. Callers
- * (generic_file_read, fault->nopage) are clever enough to check i_size
+ * (generic_file_read, vm_ops->fault) are clever enough to check i_size
* and notice that the page they just read isn't needed.
*
* XXX sys_readahead() seems to get that wrong?
*/
if (start >= i_size_read(inode)) {
- char *addr = kmap(page);
- memset(addr, 0, PAGE_SIZE);
- flush_dcache_page(page);
- kunmap(page);
+ zero_user(page, 0, PAGE_SIZE);
SetPageUptodate(page);
ret = 0;
goto out_alloc;
}
- ret = ocfs2_data_lock_with_page(inode, 0, page);
- if (ret != 0) {
- if (ret == AOP_TRUNCATED_PAGE)
- unlock = 0;
- mlog_errno(ret);
- goto out_alloc;
- }
-
- ret = block_read_full_page(page, ocfs2_get_block);
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ ret = ocfs2_readpage_inline(inode, page);
+ else
+ ret = block_read_full_page(page, ocfs2_get_block);
unlock = 0;
- ocfs2_data_unlock(inode, 0);
out_alloc:
up_read(&OCFS2_I(inode)->ip_alloc_sem);
- ocfs2_meta_unlock(inode, 0);
+out_inode_unlock:
+ ocfs2_inode_unlock(inode, 0);
out:
if (unlock)
unlock_page(page);
- mlog_exit(ret);
return ret;
}
-/* Note: Because we don't support holes, our allocation has
- * already happened (allocation writes zeros to the file data)
- * so we don't have to worry about ordered writes in
- * ocfs2_writepage.
+/*
+ * This is used only for read-ahead. Failures or difficult to handle
+ * situations are safe to ignore.
*
- * ->writepage is called during the process of invalidating the page cache
- * during blocked lock processing. It can't block on any cluster locks
- * to during block mapping. It's relying on the fact that the block
- * mapping can't have disappeared under the dirty pages that it is
- * being asked to write back.
+ * Right now, we don't bother with BH_Boundary - in-inode extent lists
+ * are quite large (243 extents on 4k blocks), so most inodes don't
+ * grow out to a tree. If need be, detecting boundary extents could
+ * trivially be added in a future version of ocfs2_get_block().
*/
-static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
+static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
{
- int ret;
-
- mlog_entry("(0x%p)\n", page);
-
- ret = block_write_full_page(page, ocfs2_get_block, wbc);
+ int ret, err = -EIO;
+ struct inode *inode = mapping->host;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ loff_t start;
+ struct page *last;
- mlog_exit(ret);
+ /*
+ * Use the nonblocking flag for the dlm code to avoid page
+ * lock inversion, but don't bother with retrying.
+ */
+ ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
+ if (ret)
+ return err;
- return ret;
-}
+ if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+ ocfs2_inode_unlock(inode, 0);
+ return err;
+ }
-/* This can also be called from ocfs2_write_zero_page() which has done
- * it's own cluster locking. */
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
- unsigned from, unsigned to)
-{
- int ret;
+ /*
+ * Don't bother with inline-data. There isn't anything
+ * to read-ahead in that case anyway...
+ */
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ goto out_unlock;
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ /*
+ * Check whether a remote node truncated this file - we just
+ * drop out in that case as it's not worth handling here.
+ */
+ last = list_entry(pages->prev, struct page, lru);
+ start = (loff_t)last->index << PAGE_CACHE_SHIFT;
+ if (start >= i_size_read(inode))
+ goto out_unlock;
- ret = block_prepare_write(page, from, to, ocfs2_get_block);
+ err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
+out_unlock:
+ up_read(&oi->ip_alloc_sem);
+ ocfs2_inode_unlock(inode, 0);
- return ret;
+ return err;
}
-/*
- * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
- * from loopback. It must be able to perform its own locking around
- * ocfs2_get_block().
+/* Note: Because we don't support holes, our allocation has
+ * already happened (allocation writes zeros to the file data)
+ * so we don't have to worry about ordered writes in
+ * ocfs2_writepage.
+ *
+ * ->writepage is called during the process of invalidating the page cache
+ * during blocked lock processing. It can't block on any cluster locks
+ * to during block mapping. It's relying on the fact that the block
+ * mapping can't have disappeared under the dirty pages that it is
+ * being asked to write back.
*/
-static int ocfs2_prepare_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
- int ret;
-
- mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-
- ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
- if (ret != 0) {
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_prepare_write_nolock(inode, page, from, to);
+ trace_ocfs2_writepage(
+ (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
+ page->index);
- ocfs2_meta_unlock(inode, 0);
-out:
- mlog_exit(ret);
- return ret;
+ return block_write_full_page(page, ocfs2_get_block, wbc);
}
/* Taken from ext3. We don't necessarily need the full blown
* functionality yet, but IMHO it's better to cut and paste the whole
* thing so we can avoid introducing our own bugs (and easily pick up
* their fixes when they happen) --Mark */
-static int walk_page_buffers( handle_t *handle,
- struct buffer_head *head,
- unsigned from,
- unsigned to,
- int *partial,
- int (*fn)( handle_t *handle,
- struct buffer_head *bh))
+int walk_page_buffers( handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)( handle_t *handle,
+ struct buffer_head *bh))
{
struct buffer_head *bh;
unsigned block_start, block_end;
@@ -355,128 +449,6 @@ static int walk_page_buffers( handle_t *handle,
return ret;
}
-struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
- struct page *page,
- unsigned from,
- unsigned to)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_journal_handle *handle = NULL;
- int ret = 0;
-
- handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
- if (!handle) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
- }
-
- if (ocfs2_should_order_data(inode)) {
- ret = walk_page_buffers(handle->k_handle,
- page_buffers(page),
- from, to, NULL,
- ocfs2_journal_dirty_data);
- if (ret < 0)
- mlog_errno(ret);
- }
-out:
- if (ret) {
- if (handle)
- ocfs2_commit_trans(handle);
- handle = ERR_PTR(ret);
- }
- return handle;
-}
-
-static int ocfs2_commit_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
-{
- int ret;
- struct buffer_head *di_bh = NULL;
- struct inode *inode = page->mapping->host;
- struct ocfs2_journal_handle *handle = NULL;
- struct ocfs2_dinode *di;
-
- mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-
- /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
- * us to continue here without rechecking the I/O against
- * changed inode values.
- *
- * 1) We're currently holding the inode alloc lock, so no
- * nodes can change it underneath us.
- *
- * 2) We've had to take the metadata lock at least once
- * already to check for extending writes, suid removal, etc.
- * The meta data update code then ensures that we don't get a
- * stale inode allocation image (i_size, i_clusters, etc).
- */
-
- ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, 1, page);
- if (ret != 0) {
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_data_lock_with_page(inode, 1, page);
- if (ret != 0) {
- mlog_errno(ret);
- goto out_unlock_meta;
- }
-
- handle = ocfs2_start_walk_page_trans(inode, page, from, to);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_unlock_data;
- }
-
- /* Mark our buffer early. We'd rather catch this error up here
- * as opposed to after a successful commit_write which would
- * require us to set back inode->i_size. */
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- /* might update i_size */
- ret = generic_commit_write(file, page, from, to);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- di = (struct ocfs2_dinode *)di_bh->b_data;
-
- /* ocfs2_mark_inode_dirty() is too heavy to use here. */
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-
- inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
- di->i_size = cpu_to_le64((u64)i_size_read(inode));
-
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_commit;
- }
-
-out_commit:
- ocfs2_commit_trans(handle);
-out_unlock_data:
- ocfs2_data_unlock(inode, 1);
-out_unlock_meta:
- ocfs2_meta_unlock(inode, 1);
-out:
- if (di_bh)
- brelse(di_bh);
-
- mlog_exit(ret);
- return ret;
-}
-
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
sector_t status;
@@ -484,13 +456,14 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
int err = 0;
struct inode *inode = mapping->host;
- mlog_entry("(block = %llu)\n", (unsigned long long)block);
+ trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)block);
/* We don't need to lock journal system files, since they aren't
* accessed concurrently from multiple nodes.
*/
if (!INODE_JOURNAL(inode)) {
- err = ocfs2_meta_lock(inode, NULL, NULL, 0);
+ err = ocfs2_inode_lock(inode, NULL, 0);
if (err) {
if (err != -ENOENT)
mlog_errno(err);
@@ -499,12 +472,13 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
down_read(&OCFS2_I(inode)->ip_alloc_sem);
}
- err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
- NULL);
+ if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+ err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+ NULL);
if (!INODE_JOURNAL(inode)) {
up_read(&OCFS2_I(inode)->ip_alloc_sem);
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
}
if (err) {
@@ -514,12 +488,9 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
goto bail;
}
-
bail:
status = err ? 0 : p_blkno;
- mlog_exit((int)status);
-
return status;
}
@@ -535,14 +506,16 @@ bail:
*
* called like this: dio->get_blocks(dio->inode, fs_startblk,
* fs_count, map_bh, dio->rw == WRITE);
+ *
+ * Note that we never bother to allocate blocks here, and thus ignore the
+ * create argument.
*/
static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
int ret;
- u64 vbo_max; /* file offset, max_blocks from iblock */
- u64 p_blkno;
- int contig_blocks;
+ u64 p_blkno, inode_blocks, contig_blocks;
+ unsigned int ext_flags;
unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
@@ -550,22 +523,12 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
* nicely aligned and of the right size, so there's no need
* for us to check any of that. */
- vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
- if ((iblock + max_blocks) >
- ocfs2_clusters_to_blocks(inode->i_sb,
- OCFS2_I(inode)->ip_clusters)) {
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- ret = -EIO;
- goto bail;
- }
- spin_unlock(&OCFS2_I(inode)->ip_lock);
+ inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
/* This figures out the size of the next contiguous block, and
* our logical offset */
- ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
- &contig_blocks);
+ ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
+ &contig_blocks, &ext_flags);
if (ret) {
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock);
@@ -573,7 +536,19 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
goto bail;
}
- map_bh(bh_result, inode->i_sb, p_blkno);
+ /* We should already CoW the refcounted extent in case of create. */
+ BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
+
+ /*
+ * get_more_blocks() expects us to describe a hole by clearing
+ * the mapped bit on bh_result().
+ *
+ * Consider an unwritten extent as a hole.
+ */
+ if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+ map_bh(bh_result, inode->i_sb, p_blkno);
+ else
+ clear_buffer_mapped(bh_result);
/* make sure we don't map more than max_blocks blocks here as
that's all the kernel will handle at this point. */
@@ -584,72 +559,1523 @@ bail:
return ret;
}
-/*
+/*
* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
- * particularly interested in the aio/dio case. Like the core uses
- * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
- * truncation on another.
+ * particularly interested in the aio/dio case. We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
*/
static void ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset,
ssize_t bytes,
void *private)
{
- struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ int level;
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
+ if (ocfs2_iocb_is_sem_locked(iocb))
+ ocfs2_iocb_clear_sem_locked(iocb);
+
+ if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
+
+ mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
+ }
+
ocfs2_iocb_clear_rw_locked(iocb);
- up_read(&inode->i_alloc_sem);
- ocfs2_rw_unlock(inode, 0);
+
+ level = ocfs2_iocb_rw_locked_level(iocb);
+ ocfs2_rw_unlock(inode, level);
+}
+
+static int ocfs2_releasepage(struct page *page, gfp_t wait)
+{
+ if (!page_has_buffers(page))
+ return 0;
+ return try_to_free_buffers(page);
}
static ssize_t ocfs2_direct_IO(int rw,
struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
+ struct inode *inode = file_inode(file)->i_mapping->host;
+
+ /*
+ * Fallback to buffered I/O if we see an inode without
+ * extents.
+ */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ /* Fallback to buffered I/O if we are appending. */
+ if (i_size_read(inode) <= offset)
+ return 0;
+
+ return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+ iter, offset,
+ ocfs2_direct_IO_get_blocks,
+ ocfs2_dio_end_io, NULL, 0);
+}
+
+static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
+ u32 cpos,
+ unsigned int *start,
+ unsigned int *end)
+{
+ unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
+
+ if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
+ unsigned int cpp;
+
+ cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
+
+ cluster_start = cpos % cpp;
+ cluster_start = cluster_start << osb->s_clustersize_bits;
+
+ cluster_end = cluster_start + osb->s_clustersize;
+ }
+
+ BUG_ON(cluster_start > PAGE_SIZE);
+ BUG_ON(cluster_end > PAGE_SIZE);
+
+ if (start)
+ *start = cluster_start;
+ if (end)
+ *end = cluster_end;
+}
+
+/*
+ * 'from' and 'to' are the region in the page to avoid zeroing.
+ *
+ * If pagesize > clustersize, this function will avoid zeroing outside
+ * of the cluster boundary.
+ *
+ * from == to == 0 is code for "zero the entire cluster region"
+ */
+static void ocfs2_clear_page_regions(struct page *page,
+ struct ocfs2_super *osb, u32 cpos,
+ unsigned from, unsigned to)
+{
+ void *kaddr;
+ unsigned int cluster_start, cluster_end;
+
+ ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
+
+ kaddr = kmap_atomic(page);
+
+ if (from || to) {
+ if (from > cluster_start)
+ memset(kaddr + cluster_start, 0, from - cluster_start);
+ if (to < cluster_end)
+ memset(kaddr + to, 0, cluster_end - to);
+ } else {
+ memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
+ }
+
+ kunmap_atomic(kaddr);
+}
+
+/*
+ * Nonsparse file systems fully allocate before we get to the write
+ * code. This prevents ocfs2_write() from tagging the write as an
+ * allocating one, which means ocfs2_map_page_blocks() might try to
+ * read-in the blocks at the tail of our file. Avoid reading them by
+ * testing i_size against each block offset.
+ */
+static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
+ unsigned int block_start)
+{
+ u64 offset = page_offset(page) + block_start;
+
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ return 1;
+
+ if (i_size_read(inode) > offset)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Some of this taken from __block_write_begin(). We already have our
+ * mapping by now though, and the entire write will be allocating or
+ * it won't, so not much need to use BH_New.
+ *
+ * This will also skip zeroing, which is handled externally.
+ */
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+ struct inode *inode, unsigned int from,
+ unsigned int to, int new)
+{
+ int ret = 0;
+ struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
+ unsigned int block_end, block_start;
+ unsigned int bsize = 1 << inode->i_blkbits;
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, bsize, 0);
+
+ head = page_buffers(page);
+ for (bh = head, block_start = 0; bh != head || !block_start;
+ bh = bh->b_this_page, block_start += bsize) {
+ block_end = block_start + bsize;
+
+ clear_buffer_new(bh);
+
+ /*
+ * Ignore blocks outside of our i/o range -
+ * they may belong to unallocated clusters.
+ */
+ if (block_start >= to || block_end <= from) {
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
+ continue;
+ }
+
+ /*
+ * For an allocating write with cluster size >= page
+ * size, we always write the entire page.
+ */
+ if (new)
+ set_buffer_new(bh);
+
+ if (!buffer_mapped(bh)) {
+ map_bh(bh, inode->i_sb, *p_blkno);
+ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ }
+
+ if (PageUptodate(page)) {
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
+ !buffer_new(bh) &&
+ ocfs2_should_read_blk(inode, page, block_start) &&
+ (block_start < from || block_end > to)) {
+ ll_rw_block(READ, 1, &bh);
+ *wait_bh++=bh;
+ }
+
+ *p_blkno = *p_blkno + 1;
+ }
+
+ /*
+ * If we issued read requests - let them complete.
+ */
+ while(wait_bh > wait) {
+ wait_on_buffer(*--wait_bh);
+ if (!buffer_uptodate(*wait_bh))
+ ret = -EIO;
+ }
+
+ if (ret == 0 || !new)
+ return ret;
+
+ /*
+ * If we get -EIO above, zero out any newly allocated blocks
+ * to avoid exposing stale data.
+ */
+ bh = head;
+ block_start = 0;
+ do {
+ block_end = block_start + bsize;
+ if (block_end <= from)
+ goto next_bh;
+ if (block_start >= to)
+ break;
+
+ zero_user(page, block_start, bh->b_size);
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+
+next_bh:
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
+
+ return ret;
+}
+
+#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
+#define OCFS2_MAX_CTXT_PAGES 1
+#else
+#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
+#endif
+
+#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+
+/*
+ * Describe the state of a single cluster to be written to.
+ */
+struct ocfs2_write_cluster_desc {
+ u32 c_cpos;
+ u32 c_phys;
+ /*
+ * Give this a unique field because c_phys eventually gets
+ * filled.
+ */
+ unsigned c_new;
+ unsigned c_unwritten;
+ unsigned c_needs_zero;
+};
+
+struct ocfs2_write_ctxt {
+ /* Logical cluster position / len of write */
+ u32 w_cpos;
+ u32 w_clen;
+
+ /* First cluster allocated in a nonsparse extend */
+ u32 w_first_new_cpos;
+
+ struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
+
+ /*
+ * This is true if page_size > cluster_size.
+ *
+ * It triggers a set of special cases during write which might
+ * have to deal with allocating writes to partial pages.
+ */
+ unsigned int w_large_pages;
+
+ /*
+ * Pages involved in this write.
+ *
+ * w_target_page is the page being written to by the user.
+ *
+ * w_pages is an array of pages which always contains
+ * w_target_page, and in the case of an allocating write with
+ * page_size < cluster size, it will contain zero'd and mapped
+ * pages adjacent to w_target_page which need to be written
+ * out in so that future reads from that region will get
+ * zero's.
+ */
+ unsigned int w_num_pages;
+ struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
+ struct page *w_target_page;
+
+ /*
+ * w_target_locked is used for page_mkwrite path indicating no unlocking
+ * against w_target_page in ocfs2_write_end_nolock.
+ */
+ unsigned int w_target_locked:1;
+
+ /*
+ * ocfs2_write_end() uses this to know what the real range to
+ * write in the target should be.
+ */
+ unsigned int w_target_from;
+ unsigned int w_target_to;
+
+ /*
+ * We could use journal_current_handle() but this is cleaner,
+ * IMHO -Mark
+ */
+ handle_t *w_handle;
+
+ struct buffer_head *w_di_bh;
+
+ struct ocfs2_cached_dealloc_ctxt w_dealloc;
+};
+
+void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
+{
+ int i;
+
+ for(i = 0; i < num_pages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ mark_page_accessed(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ }
+}
+
+static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+{
+ int i;
+
+ /*
+ * w_target_locked is only set to true in the page_mkwrite() case.
+ * The intent is to allow us to lock the target page from write_begin()
+ * to write_end(). The caller must hold a ref on w_target_page.
+ */
+ if (wc->w_target_locked) {
+ BUG_ON(!wc->w_target_page);
+ for (i = 0; i < wc->w_num_pages; i++) {
+ if (wc->w_target_page == wc->w_pages[i]) {
+ wc->w_pages[i] = NULL;
+ break;
+ }
+ }
+ mark_page_accessed(wc->w_target_page);
+ page_cache_release(wc->w_target_page);
+ }
+ ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
+
+ brelse(wc->w_di_bh);
+ kfree(wc);
+}
+
+static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
+ struct ocfs2_super *osb, loff_t pos,
+ unsigned len, struct buffer_head *di_bh)
+{
+ u32 cend;
+ struct ocfs2_write_ctxt *wc;
+
+ wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
+ if (!wc)
+ return -ENOMEM;
+
+ wc->w_cpos = pos >> osb->s_clustersize_bits;
+ wc->w_first_new_cpos = UINT_MAX;
+ cend = (pos + len - 1) >> osb->s_clustersize_bits;
+ wc->w_clen = cend - wc->w_cpos + 1;
+ get_bh(di_bh);
+ wc->w_di_bh = di_bh;
+
+ if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+ wc->w_large_pages = 1;
+ else
+ wc->w_large_pages = 0;
+
+ ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+
+ *wcp = wc;
+
+ return 0;
+}
+
+/*
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
+ */
+static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+ unsigned int block_start, block_end;
+ struct buffer_head *head, *bh;
+
+ BUG_ON(!PageLocked(page));
+ if (!page_has_buffers(page))
+ return;
+
+ bh = head = page_buffers(page);
+ block_start = 0;
+ do {
+ block_end = block_start + bh->b_size;
+
+ if (buffer_new(bh)) {
+ if (block_end > from && block_start < to) {
+ if (!PageUptodate(page)) {
+ unsigned start, end;
+
+ start = max(from, block_start);
+ end = min(to, block_end);
+
+ zero_user_segment(page, start, end);
+ set_buffer_uptodate(bh);
+ }
+
+ clear_buffer_new(bh);
+ mark_buffer_dirty(bh);
+ }
+ }
+
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
+
+/*
+ * Only called when we have a failure during allocating write to write
+ * zero's to the newly allocated region.
+ */
+static void ocfs2_write_failure(struct inode *inode,
+ struct ocfs2_write_ctxt *wc,
+ loff_t user_pos, unsigned user_len)
+{
+ int i;
+ unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
+ to = user_pos + user_len;
+ struct page *tmppage;
+
+ ocfs2_zero_new_buffers(wc->w_target_page, from, to);
+
+ for(i = 0; i < wc->w_num_pages; i++) {
+ tmppage = wc->w_pages[i];
+
+ if (page_has_buffers(tmppage)) {
+ if (ocfs2_should_order_data(inode))
+ ocfs2_jbd2_file_inode(wc->w_handle, inode);
+
+ block_commit_write(tmppage, from, to);
+ }
+ }
+}
+
+static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
+ struct ocfs2_write_ctxt *wc,
+ struct page *page, u32 cpos,
+ loff_t user_pos, unsigned user_len,
+ int new)
+{
int ret;
+ unsigned int map_from = 0, map_to = 0;
+ unsigned int cluster_start, cluster_end;
+ unsigned int user_data_from = 0, user_data_to = 0;
+
+ ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
+ &cluster_start, &cluster_end);
+
+ /* treat the write as new if the a hole/lseek spanned across
+ * the page boundary.
+ */
+ new = new | ((i_size_read(inode) <= page_offset(page)) &&
+ (page_offset(page) <= user_pos));
+
+ if (page == wc->w_target_page) {
+ map_from = user_pos & (PAGE_CACHE_SIZE - 1);
+ map_to = map_from + user_len;
+
+ if (new)
+ ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+ cluster_start, cluster_end,
+ new);
+ else
+ ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+ map_from, map_to, new);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
- mlog_entry_void();
+ user_data_from = map_from;
+ user_data_to = map_to;
+ if (new) {
+ map_from = cluster_start;
+ map_to = cluster_end;
+ }
+ } else {
+ /*
+ * If we haven't allocated the new page yet, we
+ * shouldn't be writing it out without copying user
+ * data. This is likely a math error from the caller.
+ */
+ BUG_ON(!new);
+
+ map_from = cluster_start;
+ map_to = cluster_end;
+
+ ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+ cluster_start, cluster_end, new);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
/*
- * We get PR data locks even for O_DIRECT. This allows
- * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
- * extending and buffered zeroing writes race. If they did
- * race then the buffered zeroing could be written back after
- * the O_DIRECT I/O. It's one thing to tell people not to mix
- * buffered and O_DIRECT writes, but expecting them to
- * understand that file extension is also an implicit buffered
- * write is too much. By getting the PR we force writeback of
- * the buffered zeroing before proceeding.
+ * Parts of newly allocated pages need to be zero'd.
+ *
+ * Above, we have also rewritten 'to' and 'from' - as far as
+ * the rest of the function is concerned, the entire cluster
+ * range inside of a page needs to be written.
+ *
+ * We can skip this if the page is up to date - it's already
+ * been zero'd from being read in as a hole.
+ */
+ if (new && !PageUptodate(page))
+ ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
+ cpos, user_data_from, user_data_to);
+
+ flush_dcache_page(page);
+
+out:
+ return ret;
+}
+
+/*
+ * This function will only grab one clusters worth of pages.
+ */
+static int ocfs2_grab_pages_for_write(struct address_space *mapping,
+ struct ocfs2_write_ctxt *wc,
+ u32 cpos, loff_t user_pos,
+ unsigned user_len, int new,
+ struct page *mmap_page)
+{
+ int ret = 0, i;
+ unsigned long start, target_index, end_index, index;
+ struct inode *inode = mapping->host;
+ loff_t last_byte;
+
+ target_index = user_pos >> PAGE_CACHE_SHIFT;
+
+ /*
+ * Figure out how many pages we'll be manipulating here. For
+ * non allocating write, we just change the one
+ * page. Otherwise, we'll need a whole clusters worth. If we're
+ * writing past i_size, we only need enough pages to cover the
+ * last page of the write.
+ */
+ if (new) {
+ wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
+ start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+ /*
+ * We need the index *past* the last page we could possibly
+ * touch. This is the page past the end of the write or
+ * i_size, whichever is greater.
+ */
+ last_byte = max(user_pos + user_len, i_size_read(inode));
+ BUG_ON(last_byte < 1);
+ end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
+ if ((start + wc->w_num_pages) > end_index)
+ wc->w_num_pages = end_index - start;
+ } else {
+ wc->w_num_pages = 1;
+ start = target_index;
+ }
+
+ for(i = 0; i < wc->w_num_pages; i++) {
+ index = start + i;
+
+ if (index == target_index && mmap_page) {
+ /*
+ * ocfs2_pagemkwrite() is a little different
+ * and wants us to directly use the page
+ * passed in.
+ */
+ lock_page(mmap_page);
+
+ /* Exit and let the caller retry */
+ if (mmap_page->mapping != mapping) {
+ WARN_ON(mmap_page->mapping);
+ unlock_page(mmap_page);
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ page_cache_get(mmap_page);
+ wc->w_pages[i] = mmap_page;
+ wc->w_target_locked = true;
+ } else {
+ wc->w_pages[i] = find_or_create_page(mapping, index,
+ GFP_NOFS);
+ if (!wc->w_pages[i]) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ wait_for_stable_page(wc->w_pages[i]);
+
+ if (index == target_index)
+ wc->w_target_page = wc->w_pages[i];
+ }
+out:
+ if (ret)
+ wc->w_target_locked = false;
+ return ret;
+}
+
+/*
+ * Prepare a single cluster for write one cluster into the file.
+ */
+static int ocfs2_write_cluster(struct address_space *mapping,
+ u32 phys, unsigned int unwritten,
+ unsigned int should_zero,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_write_ctxt *wc, u32 cpos,
+ loff_t user_pos, unsigned user_len)
+{
+ int ret, i, new;
+ u64 v_blkno, p_blkno;
+ struct inode *inode = mapping->host;
+ struct ocfs2_extent_tree et;
+
+ new = phys == 0 ? 1 : 0;
+ if (new) {
+ u32 tmp_pos;
+
+ /*
+ * This is safe to call with the page locks - it won't take
+ * any additional semaphores or cluster locks.
+ */
+ tmp_pos = cpos;
+ ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
+ &tmp_pos, 1, 0, wc->w_di_bh,
+ wc->w_handle, data_ac,
+ meta_ac, NULL);
+ /*
+ * This shouldn't happen because we must have already
+ * calculated the correct meta data allocation required. The
+ * internal tree allocation code should know how to increase
+ * transaction credits itself.
+ *
+ * If need be, we could handle -EAGAIN for a
+ * RESTART_TRANS here.
+ */
+ mlog_bug_on_msg(ret == -EAGAIN,
+ "Inode %llu: EAGAIN return during allocation.\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else if (unwritten) {
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
+ wc->w_di_bh);
+ ret = ocfs2_mark_extent_written(inode, &et,
+ wc->w_handle, cpos, 1, phys,
+ meta_ac, &wc->w_dealloc);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (should_zero)
+ v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
+ else
+ v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
+
+ /*
+ * The only reason this should fail is due to an inability to
+ * find the extent added.
*/
- ret = ocfs2_data_lock(inode, 0);
+ ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
+ NULL);
if (ret < 0) {
+ ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, "
+ "at logical block %llu",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)v_blkno);
+ goto out;
+ }
+
+ BUG_ON(p_blkno == 0);
+
+ for(i = 0; i < wc->w_num_pages; i++) {
+ int tmpret;
+
+ tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
+ wc->w_pages[i], cpos,
+ user_pos, user_len,
+ should_zero);
+ if (tmpret) {
+ mlog_errno(tmpret);
+ if (ret == 0)
+ ret = tmpret;
+ }
+ }
+
+ /*
+ * We only have cleanup to do in case of allocating write.
+ */
+ if (ret && new)
+ ocfs2_write_failure(inode, wc, user_pos, user_len);
+
+out:
+
+ return ret;
+}
+
+static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_write_ctxt *wc,
+ loff_t pos, unsigned len)
+{
+ int ret, i;
+ loff_t cluster_off;
+ unsigned int local_len = len;
+ struct ocfs2_write_cluster_desc *desc;
+ struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb);
+
+ for (i = 0; i < wc->w_clen; i++) {
+ desc = &wc->w_desc[i];
+
+ /*
+ * We have to make sure that the total write passed in
+ * doesn't extend past a single cluster.
+ */
+ local_len = len;
+ cluster_off = pos & (osb->s_clustersize - 1);
+ if ((cluster_off + local_len) > osb->s_clustersize)
+ local_len = osb->s_clustersize - cluster_off;
+
+ ret = ocfs2_write_cluster(mapping, desc->c_phys,
+ desc->c_unwritten,
+ desc->c_needs_zero,
+ data_ac, meta_ac,
+ wc, desc->c_cpos, pos, local_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ len -= local_len;
+ pos += local_len;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
+ * ocfs2_write_end() wants to know which parts of the target page it
+ * should complete the write on. It's easiest to compute them ahead of
+ * time when a more complete view of the write is available.
+ */
+static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
+ struct ocfs2_write_ctxt *wc,
+ loff_t pos, unsigned len, int alloc)
+{
+ struct ocfs2_write_cluster_desc *desc;
+
+ wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
+ wc->w_target_to = wc->w_target_from + len;
+
+ if (alloc == 0)
+ return;
+
+ /*
+ * Allocating write - we may have different boundaries based
+ * on page size and cluster size.
+ *
+ * NOTE: We can no longer compute one value from the other as
+ * the actual write length and user provided length may be
+ * different.
+ */
+
+ if (wc->w_large_pages) {
+ /*
+ * We only care about the 1st and last cluster within
+ * our range and whether they should be zero'd or not. Either
+ * value may be extended out to the start/end of a
+ * newly allocated cluster.
+ */
+ desc = &wc->w_desc[0];
+ if (desc->c_needs_zero)
+ ocfs2_figure_cluster_boundaries(osb,
+ desc->c_cpos,
+ &wc->w_target_from,
+ NULL);
+
+ desc = &wc->w_desc[wc->w_clen - 1];
+ if (desc->c_needs_zero)
+ ocfs2_figure_cluster_boundaries(osb,
+ desc->c_cpos,
+ NULL,
+ &wc->w_target_to);
+ } else {
+ wc->w_target_from = 0;
+ wc->w_target_to = PAGE_CACHE_SIZE;
+ }
+}
+
+/*
+ * Populate each single-cluster write descriptor in the write context
+ * with information about the i/o to be done.
+ *
+ * Returns the number of clusters that will have to be allocated, as
+ * well as a worst case estimate of the number of extent records that
+ * would have to be created during a write to an unwritten region.
+ */
+static int ocfs2_populate_write_desc(struct inode *inode,
+ struct ocfs2_write_ctxt *wc,
+ unsigned int *clusters_to_alloc,
+ unsigned int *extents_to_split)
+{
+ int ret;
+ struct ocfs2_write_cluster_desc *desc;
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+ u32 phys = 0;
+ int i;
+
+ *clusters_to_alloc = 0;
+ *extents_to_split = 0;
+
+ for (i = 0; i < wc->w_clen; i++) {
+ desc = &wc->w_desc[i];
+ desc->c_cpos = wc->w_cpos + i;
+
+ if (num_clusters == 0) {
+ /*
+ * Need to look up the next extent record.
+ */
+ ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* We should already CoW the refcountd extent. */
+ BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+
+ /*
+ * Assume worst case - that we're writing in
+ * the middle of the extent.
+ *
+ * We can assume that the write proceeds from
+ * left to right, in which case the extent
+ * insert code is smart enough to coalesce the
+ * next splits into the previous records created.
+ */
+ if (ext_flags & OCFS2_EXT_UNWRITTEN)
+ *extents_to_split = *extents_to_split + 2;
+ } else if (phys) {
+ /*
+ * Only increment phys if it doesn't describe
+ * a hole.
+ */
+ phys++;
+ }
+
+ /*
+ * If w_first_new_cpos is < UINT_MAX, we have a non-sparse
+ * file that got extended. w_first_new_cpos tells us
+ * where the newly allocated clusters are so we can
+ * zero them.
+ */
+ if (desc->c_cpos >= wc->w_first_new_cpos) {
+ BUG_ON(phys == 0);
+ desc->c_needs_zero = 1;
+ }
+
+ desc->c_phys = phys;
+ if (phys == 0) {
+ desc->c_new = 1;
+ desc->c_needs_zero = 1;
+ *clusters_to_alloc = *clusters_to_alloc + 1;
+ }
+
+ if (ext_flags & OCFS2_EXT_UNWRITTEN) {
+ desc->c_unwritten = 1;
+ desc->c_needs_zero = 1;
+ }
+
+ num_clusters--;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static int ocfs2_write_begin_inline(struct address_space *mapping,
+ struct inode *inode,
+ struct ocfs2_write_ctxt *wc)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct page *page;
+ handle_t *handle;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+
+ page = find_or_create_page(mapping, 0, GFP_NOFS);
+ if (!page) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ /*
+ * If we don't set w_num_pages then this page won't get unlocked
+ * and freed on cleanup of the write context.
+ */
+ wc->w_pages[0] = wc->w_target_page = page;
+ wc->w_num_pages = 1;
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ ocfs2_commit_trans(osb, handle);
+
mlog_errno(ret);
goto out;
}
- ocfs2_data_unlock(inode, 0);
- ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
- inode->i_sb->s_bdev, iov, offset,
- nr_segs,
- ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io);
+ if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+ ocfs2_set_inode_data_inline(inode, di);
+
+ if (!PageUptodate(page)) {
+ ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
+ if (ret) {
+ ocfs2_commit_trans(osb, handle);
+
+ goto out;
+ }
+ }
+
+ wc->w_handle = handle;
out:
- mlog_exit(ret);
+ return ret;
+}
+
+int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
+{
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ if (new_size <= le16_to_cpu(di->id2.i_data.id_count))
+ return 1;
+ return 0;
+}
+
+static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode, loff_t pos,
+ unsigned len, struct page *mmap_page,
+ struct ocfs2_write_ctxt *wc)
+{
+ int ret, written = 0;
+ loff_t end = pos + len;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = NULL;
+
+ trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno,
+ len, (unsigned long long)pos,
+ oi->ip_dyn_features);
+
+ /*
+ * Handle inodes which already have inline data 1st.
+ */
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ if (mmap_page == NULL &&
+ ocfs2_size_fits_inline_data(wc->w_di_bh, end))
+ goto do_inline_write;
+
+ /*
+ * The write won't fit - we have to give this inode an
+ * inline extent list now.
+ */
+ ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Check whether the inode can accept inline data.
+ */
+ if (oi->ip_clusters != 0 || i_size_read(inode) != 0)
+ return 0;
+
+ /*
+ * Check whether the write can fit.
+ */
+ di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+ if (mmap_page ||
+ end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di))
+ return 0;
+
+do_inline_write:
+ ret = ocfs2_write_begin_inline(mapping, inode, wc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * This signals to the caller that the data can be written
+ * inline.
+ */
+ written = 1;
+out:
+ return written ? written : ret;
+}
+
+/*
+ * This function only does anything for file systems which can't
+ * handle sparse files.
+ *
+ * What we want to do here is fill in any hole between the current end
+ * of allocation and the end of our write. That way the rest of the
+ * write path can treat it as an non-allocating write, which has no
+ * special case code for sparse/nonsparse files.
+ */
+static int ocfs2_expand_nonsparse_inode(struct inode *inode,
+ struct buffer_head *di_bh,
+ loff_t pos, unsigned len,
+ struct ocfs2_write_ctxt *wc)
+{
+ int ret;
+ loff_t newsize = pos + len;
+
+ BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+
+ if (newsize <= i_size_read(inode))
+ return 0;
+
+ ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
+ if (ret)
+ mlog_errno(ret);
+
+ wc->w_first_new_cpos =
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+
+ return ret;
+}
+
+static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+ loff_t pos)
+{
+ int ret = 0;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+ if (pos > i_size_read(inode))
+ ret = ocfs2_zero_extend(inode, di_bh, pos);
+
+ return ret;
+}
+
+/*
+ * Try to flush truncate logs if we can free enough clusters from it.
+ * As for return value, "< 0" means error, "0" no space and "1" means
+ * we have freed enough spaces and let the caller try to allocate again.
+ */
+static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+ unsigned int needed)
+{
+ tid_t target;
+ int ret = 0;
+ unsigned int truncated_clusters;
+
+ mutex_lock(&osb->osb_tl_inode->i_mutex);
+ truncated_clusters = osb->truncated_clusters;
+ mutex_unlock(&osb->osb_tl_inode->i_mutex);
+
+ /*
+ * Check whether we can succeed in allocating if we free
+ * the truncate log.
+ */
+ if (truncated_clusters < needed)
+ goto out;
+
+ ret = ocfs2_flush_truncate_log(osb);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
+ jbd2_log_wait_commit(osb->journal->j_journal, target);
+ ret = 1;
+ }
+out:
+ return ret;
+}
+
+int ocfs2_write_begin_nolock(struct file *filp,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata,
+ struct buffer_head *di_bh, struct page *mmap_page)
+{
+ int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
+ unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
+ struct ocfs2_write_ctxt *wc;
+ struct inode *inode = mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *di;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle;
+ struct ocfs2_extent_tree et;
+ int try_free = 1, ret1;
+
+try_again:
+ ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ if (ocfs2_supports_inline_data(osb)) {
+ ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
+ mmap_page, wc);
+ if (ret == 1) {
+ ret = 0;
+ goto success;
+ }
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
+ wc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_check_range_for_refcount(inode, pos, len);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ } else if (ret == 1) {
+ clusters_need = wc->w_clen;
+ ret = ocfs2_refcount_cow(inode, di_bh,
+ wc->w_cpos, wc->w_clen, UINT_MAX);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
+ &extents_to_split);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ clusters_need += clusters_to_alloc;
+
+ di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+
+ trace_ocfs2_write_begin_nolock(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (long long)i_size_read(inode),
+ le32_to_cpu(di->i_clusters),
+ pos, len, flags, mmap_page,
+ clusters_to_alloc, extents_to_split);
+
+ /*
+ * We set w_target_from, w_target_to here so that
+ * ocfs2_write_end() knows which range in the target page to
+ * write out. An allocation requires that we write the entire
+ * cluster range.
+ */
+ if (clusters_to_alloc || extents_to_split) {
+ /*
+ * XXX: We are stretching the limits of
+ * ocfs2_lock_allocators(). It greatly over-estimates
+ * the work to be done.
+ */
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
+ wc->w_di_bh);
+ ret = ocfs2_lock_allocators(inode, &et,
+ clusters_to_alloc, extents_to_split,
+ &data_ac, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (data_ac)
+ data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
+ credits = ocfs2_calc_extend_credits(inode->i_sb,
+ &di->id2.i_list);
+
+ }
+
+ /*
+ * We have to zero sparse allocated clusters, unwritten extent clusters,
+ * and non-sparse clusters we just extended. For non-sparse writes,
+ * we know zeros will only be needed in the first and/or last cluster.
+ */
+ if (clusters_to_alloc || extents_to_split ||
+ (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+ wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+ cluster_of_pages = 1;
+ else
+ cluster_of_pages = 0;
+
+ ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ wc->w_handle = handle;
+
+ if (clusters_to_alloc) {
+ ret = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
+ if (ret)
+ goto out_commit;
+ }
+ /*
+ * We don't want this to fail in ocfs2_write_end(), so do it
+ * here.
+ */
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_quota;
+ }
+
+ /*
+ * Fill our page array first. That way we've grabbed enough so
+ * that we can zero and flush if we error after adding the
+ * extent.
+ */
+ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
+ cluster_of_pages, mmap_page);
+ if (ret && ret != -EAGAIN) {
+ mlog_errno(ret);
+ goto out_quota;
+ }
+
+ /*
+ * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
+ * the target page. In this case, we exit with no error and no target
+ * page. This will trigger the caller, page_mkwrite(), to re-try
+ * the operation.
+ */
+ if (ret == -EAGAIN) {
+ BUG_ON(wc->w_target_page);
+ ret = 0;
+ goto out_quota;
+ }
+
+ ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
+ len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_quota;
+ }
+
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+success:
+ *pagep = wc->w_target_page;
+ *fsdata = wc;
+ return 0;
+out_quota:
+ if (clusters_to_alloc)
+ dquot_free_space(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ ocfs2_free_write_ctxt(wc);
+
+ if (data_ac) {
+ ocfs2_free_alloc_context(data_ac);
+ data_ac = NULL;
+ }
+ if (meta_ac) {
+ ocfs2_free_alloc_context(meta_ac);
+ meta_ac = NULL;
+ }
+
+ if (ret == -ENOSPC && try_free) {
+ /*
+ * Try to free some truncate log so that we can have enough
+ * clusters to allocate.
+ */
+ try_free = 0;
+
+ ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
+ if (ret1 == 1)
+ goto try_again;
+
+ if (ret1 < 0)
+ mlog_errno(ret1);
+ }
+
+ return ret;
+}
+
+static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+ struct inode *inode = mapping->host;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /*
+ * Take alloc sem here to prevent concurrent lookups. That way
+ * the mapping, zeroing and tree manipulation within
+ * ocfs2_write() will be safe against ->readpage(). This
+ * should also serve to lock out allocation from a shared
+ * writeable region.
+ */
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
+ fsdata, di_bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_fail;
+ }
+
+ brelse(di_bh);
+
+ return 0;
+
+out_fail:
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+ brelse(di_bh);
+ ocfs2_inode_unlock(inode, 1);
+
+ return ret;
+}
+
+static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
+ unsigned len, unsigned *copied,
+ struct ocfs2_dinode *di,
+ struct ocfs2_write_ctxt *wc)
+{
+ void *kaddr;
+
+ if (unlikely(*copied < len)) {
+ if (!PageUptodate(wc->w_target_page)) {
+ *copied = 0;
+ return;
+ }
+ }
+
+ kaddr = kmap_atomic(wc->w_target_page);
+ memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
+ kunmap_atomic(kaddr);
+
+ trace_ocfs2_write_end_inline(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)pos, *copied,
+ le16_to_cpu(di->id2.i_data.id_count),
+ le16_to_cpu(di->i_dyn_features));
+}
+
+int ocfs2_write_end_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ int i;
+ unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
+ struct inode *inode = mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_write_ctxt *wc = fsdata;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+ handle_t *handle = wc->w_handle;
+ struct page *tmppage;
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
+ goto out_write_size;
+ }
+
+ if (unlikely(copied < len)) {
+ if (!PageUptodate(wc->w_target_page))
+ copied = 0;
+
+ ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
+ start+len);
+ }
+ flush_dcache_page(wc->w_target_page);
+
+ for(i = 0; i < wc->w_num_pages; i++) {
+ tmppage = wc->w_pages[i];
+
+ if (tmppage == wc->w_target_page) {
+ from = wc->w_target_from;
+ to = wc->w_target_to;
+
+ BUG_ON(from > PAGE_CACHE_SIZE ||
+ to > PAGE_CACHE_SIZE ||
+ to < from);
+ } else {
+ /*
+ * Pages adjacent to the target (if any) imply
+ * a hole-filling write in which case we want
+ * to flush their entire range.
+ */
+ from = 0;
+ to = PAGE_CACHE_SIZE;
+ }
+
+ if (page_has_buffers(tmppage)) {
+ if (ocfs2_should_order_data(inode))
+ ocfs2_jbd2_file_inode(wc->w_handle, inode);
+ block_commit_write(tmppage, from, to);
+ }
+ }
+
+out_write_size:
+ pos += copied;
+ if (pos > i_size_read(inode)) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ ocfs2_journal_dirty(handle, wc->w_di_bh);
+
+ ocfs2_commit_trans(osb, handle);
+
+ ocfs2_run_deallocs(osb, &wc->w_dealloc);
+
+ ocfs2_free_write_ctxt(wc);
+
+ return copied;
+}
+
+static int ocfs2_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ int ret;
+ struct inode *inode = mapping->host;
+
+ ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
+
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ ocfs2_inode_unlock(inode, 1);
+
return ret;
}
const struct address_space_operations ocfs2_aops = {
- .readpage = ocfs2_readpage,
- .writepage = ocfs2_writepage,
- .prepare_write = ocfs2_prepare_write,
- .commit_write = ocfs2_commit_write,
- .bmap = ocfs2_bmap,
- .sync_page = block_sync_page,
- .direct_IO = ocfs2_direct_IO
+ .readpage = ocfs2_readpage,
+ .readpages = ocfs2_readpages,
+ .writepage = ocfs2_writepage,
+ .write_begin = ocfs2_write_begin,
+ .write_end = ocfs2_write_end,
+ .bmap = ocfs2_bmap,
+ .direct_IO = ocfs2_direct_IO,
+ .invalidatepage = block_invalidatepage,
+ .releasepage = ocfs2_releasepage,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index e88c3f0b8fa..6cae155d54d 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,20 +22,84 @@
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
- unsigned from, unsigned to);
+#include <linux/aio.h>
-struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
+handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
unsigned from,
unsigned to);
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+ struct inode *inode, unsigned int from,
+ unsigned int to, int new);
+
+void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages);
+
+int walk_page_buffers( handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)( handle_t *handle,
+ struct buffer_head *bh));
+
+int ocfs2_write_end_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
+
+int ocfs2_write_begin_nolock(struct file *filp,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata,
+ struct buffer_head *di_bh, struct page *mmap_page);
+
+int ocfs2_read_inline_data(struct inode *inode, struct page *page,
+ struct buffer_head *di_bh);
+int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
+
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
/* all ocfs2_dio_end_io()'s fault */
#define ocfs2_iocb_is_rw_locked(iocb) \
test_bit(0, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_rw_locked(iocb) \
- set_bit(0, (unsigned long *)&iocb->private)
+static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
+{
+ set_bit(0, (unsigned long *)&iocb->private);
+ if (level)
+ set_bit(1, (unsigned long *)&iocb->private);
+ else
+ clear_bit(1, (unsigned long *)&iocb->private);
+}
+
+/*
+ * Using a named enum representing lock types in terms of #N bit stored in
+ * iocb->private, which is going to be used for communication between
+ * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
+ */
+enum ocfs2_iocb_lock_bits {
+ OCFS2_IOCB_RW_LOCK = 0,
+ OCFS2_IOCB_RW_LOCK_LEVEL,
+ OCFS2_IOCB_SEM,
+ OCFS2_IOCB_UNALIGNED_IO,
+ OCFS2_IOCB_NUM_LOCKS
+};
+
#define ocfs2_iocb_clear_rw_locked(iocb) \
- clear_bit(0, (unsigned long *)&iocb->private)
+ clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_rw_locked_level(iocb) \
+ test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_set_sem_locked(iocb) \
+ set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_sem_locked(iocb) \
+ clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_sem_locked(iocb) \
+ test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+
+#define ocfs2_iocb_set_unaligned_aio(iocb) \
+ set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_unaligned_aio(iocb) \
+ clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_unaligned_aio(iocb) \
+ test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
new file mode 100644
index 00000000000..0725e605465
--- /dev/null
+++ b/fs/ocfs2/blockcheck.c
@@ -0,0 +1,647 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.c
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2006, 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <asm/byteorder.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "blockcheck.h"
+
+
+/*
+ * We use the following conventions:
+ *
+ * d = # data bits
+ * p = # parity bits
+ * c = # total code bits (d + p)
+ */
+
+
+/*
+ * Calculate the bit offset in the hamming code buffer based on the bit's
+ * offset in the data buffer. Since the hamming code reserves all
+ * power-of-two bits for parity, the data bit number and the code bit
+ * number are offset by all the parity bits beforehand.
+ *
+ * Recall that bit numbers in hamming code are 1-based. This function
+ * takes the 0-based data bit from the caller.
+ *
+ * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0),
+ * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit.
+ * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3
+ * in the code buffer.
+ *
+ * The caller can pass in *p if it wants to keep track of the most recent
+ * number of parity bits added. This allows the function to start the
+ * calculation at the last place.
+ */
+static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
+{
+ unsigned int b, p = 0;
+
+ /*
+ * Data bits are 0-based, but we're talking code bits, which
+ * are 1-based.
+ */
+ b = i + 1;
+
+ /* Use the cache if it is there */
+ if (p_cache)
+ p = *p_cache;
+ b += p;
+
+ /*
+ * For every power of two below our bit number, bump our bit.
+ *
+ * We compare with (b + 1) because we have to compare with what b
+ * would be _if_ it were bumped up by the parity bit. Capice?
+ *
+ * p is set above.
+ */
+ for (; (1 << p) < (b + 1); p++)
+ b++;
+
+ if (p_cache)
+ *p_cache = p;
+
+ return b;
+}
+
+/*
+ * This is the low level encoder function. It can be called across
+ * multiple hunks just like the crc32 code. 'd' is the number of bits
+ * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
+{
+ unsigned int i, b, p = 0;
+
+ BUG_ON(!d);
+
+ /*
+ * b is the hamming code bit number. Hamming code specifies a
+ * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is
+ * for the algorithm.
+ *
+ * The i++ in the for loop is so that the start offset passed
+ * to ocfs2_find_next_bit_set() is one greater than the previously
+ * found bit.
+ */
+ for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
+ {
+ /*
+ * i is the offset in this hunk, nr + i is the total bit
+ * offset.
+ */
+ b = calc_code_bit(nr + i, &p);
+
+ /*
+ * Data bits in the resultant code are checked by
+ * parity bits that are part of the bit number
+ * representation. Huh?
+ *
+ * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+ * In other words, the parity bit at position 2^k
+ * checks bits in positions having bit k set in
+ * their binary representation. Conversely, for
+ * instance, bit 13, i.e. 1101(2), is checked by
+ * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
+ * </wikipedia>
+ *
+ * Note that 'k' is the _code_ bit number. 'b' in
+ * our loop.
+ */
+ parity ^= b;
+ }
+
+ /* While the data buffer was treated as little endian, the
+ * return value is in host endian. */
+ return parity;
+}
+
+u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
+{
+ return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
+}
+
+/*
+ * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
+ * offset of the current hunk. If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one hunk, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+ unsigned int fix)
+{
+ unsigned int i, b;
+
+ BUG_ON(!d);
+
+ /*
+ * If the bit to fix has an hweight of 1, it's a parity bit. One
+ * busted parity bit is its own error. Nothing to do here.
+ */
+ if (hweight32(fix) == 1)
+ return;
+
+ /*
+ * nr + d is the bit right past the data hunk we're looking at.
+ * If fix after that, nothing to do
+ */
+ if (fix >= calc_code_bit(nr + d, NULL))
+ return;
+
+ /*
+ * nr is the offset in the data hunk we're starting at. Let's
+ * start b at the offset in the code buffer. See hamming_encode()
+ * for a more detailed description of 'b'.
+ */
+ b = calc_code_bit(nr, NULL);
+ /* If the fix is before this hunk, nothing to do */
+ if (fix < b)
+ return;
+
+ for (i = 0; i < d; i++, b++)
+ {
+ /* Skip past parity bits */
+ while (hweight32(b) == 1)
+ b++;
+
+ /*
+ * i is the offset in this data hunk.
+ * nr + i is the offset in the total data buffer.
+ * b is the offset in the total code buffer.
+ *
+ * Thus, when b == fix, bit i in the current hunk needs
+ * fixing.
+ */
+ if (b == fix)
+ {
+ if (ocfs2_test_bit(i, data))
+ ocfs2_clear_bit(i, data);
+ else
+ ocfs2_set_bit(i, data);
+ break;
+ }
+ }
+}
+
+void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+ unsigned int fix)
+{
+ ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
+}
+
+
+/*
+ * Debugfs handling.
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+static int blockcheck_u64_get(void *data, u64 *val)
+{
+ *val = *(u64 *)data;
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
+
+static struct dentry *blockcheck_debugfs_create(const char *name,
+ struct dentry *parent,
+ u64 *value)
+{
+ return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
+ &blockcheck_fops);
+}
+
+static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+ if (stats) {
+ debugfs_remove(stats->b_debug_check);
+ stats->b_debug_check = NULL;
+ debugfs_remove(stats->b_debug_failure);
+ stats->b_debug_failure = NULL;
+ debugfs_remove(stats->b_debug_recover);
+ stats->b_debug_recover = NULL;
+ debugfs_remove(stats->b_debug_dir);
+ stats->b_debug_dir = NULL;
+ }
+}
+
+static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+ struct dentry *parent)
+{
+ int rc = -EINVAL;
+
+ if (!stats)
+ goto out;
+
+ stats->b_debug_dir = debugfs_create_dir("blockcheck", parent);
+ if (!stats->b_debug_dir)
+ goto out;
+
+ stats->b_debug_check =
+ blockcheck_debugfs_create("blocks_checked",
+ stats->b_debug_dir,
+ &stats->b_check_count);
+
+ stats->b_debug_failure =
+ blockcheck_debugfs_create("checksums_failed",
+ stats->b_debug_dir,
+ &stats->b_failure_count);
+
+ stats->b_debug_recover =
+ blockcheck_debugfs_create("ecc_recoveries",
+ stats->b_debug_dir,
+ &stats->b_recover_count);
+ if (stats->b_debug_check && stats->b_debug_failure &&
+ stats->b_debug_recover)
+ rc = 0;
+
+out:
+ if (rc)
+ ocfs2_blockcheck_debug_remove(stats);
+ return rc;
+}
+#else
+static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+ struct dentry *parent)
+{
+ return 0;
+}
+
+static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
+/* Always-called wrappers for starting and stopping the debugfs files */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+ struct dentry *parent)
+{
+ return ocfs2_blockcheck_debug_install(stats, parent);
+}
+
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats)
+{
+ ocfs2_blockcheck_debug_remove(stats);
+}
+
+static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats)
+{
+ u64 new_count;
+
+ if (!stats)
+ return;
+
+ spin_lock(&stats->b_lock);
+ stats->b_check_count++;
+ new_count = stats->b_check_count;
+ spin_unlock(&stats->b_lock);
+
+ if (!new_count)
+ mlog(ML_NOTICE, "Block check count has wrapped\n");
+}
+
+static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats)
+{
+ u64 new_count;
+
+ if (!stats)
+ return;
+
+ spin_lock(&stats->b_lock);
+ stats->b_failure_count++;
+ new_count = stats->b_failure_count;
+ spin_unlock(&stats->b_lock);
+
+ if (!new_count)
+ mlog(ML_NOTICE, "Checksum failure count has wrapped\n");
+}
+
+static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats)
+{
+ u64 new_count;
+
+ if (!stats)
+ return;
+
+ spin_lock(&stats->b_lock);
+ stats->b_recover_count++;
+ new_count = stats->b_recover_count;
+ spin_unlock(&stats->b_lock);
+
+ if (!new_count)
+ mlog(ML_NOTICE, "ECC recovery count has wrapped\n");
+}
+
+
+
+/*
+ * These are the low-level APIs for using the ocfs2_block_check structure.
+ */
+
+/*
+ * This function generates check information for a block.
+ * data is the block to be checked. bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information. If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+ struct ocfs2_block_check *bc)
+{
+ u32 crc;
+ u32 ecc;
+
+ memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+ crc = crc32_le(~0, data, blocksize);
+ ecc = ocfs2_hamming_encode_block(data, blocksize);
+
+ /*
+ * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+ * larger than 16 bits.
+ */
+ BUG_ON(ecc > USHRT_MAX);
+
+ bc->bc_crc32e = cpu_to_le32(crc);
+ bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information. Like _compute,
+ * the function will take care of zeroing bc before calculating check codes.
+ * If bc is not a pointer inside data, the caller must have zeroed any
+ * inline ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+ struct ocfs2_block_check *bc,
+ struct ocfs2_blockcheck_stats *stats)
+{
+ int rc = 0;
+ u32 bc_crc32e;
+ u16 bc_ecc;
+ u32 crc, ecc;
+
+ ocfs2_blockcheck_inc_check(stats);
+
+ bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+ bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+ memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+ /* Fast path - if the crc32 validates, we're good to go */
+ crc = crc32_le(~0, data, blocksize);
+ if (crc == bc_crc32e)
+ goto out;
+
+ ocfs2_blockcheck_inc_failure(stats);
+ mlog(ML_ERROR,
+ "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
+ (unsigned int)bc_crc32e, (unsigned int)crc);
+
+ /* Ok, try ECC fixups */
+ ecc = ocfs2_hamming_encode_block(data, blocksize);
+ ocfs2_hamming_fix_block(data, blocksize, ecc ^ bc_ecc);
+
+ /* And check the crc32 again */
+ crc = crc32_le(~0, data, blocksize);
+ if (crc == bc_crc32e) {
+ ocfs2_blockcheck_inc_recover(stats);
+ goto out;
+ }
+
+ mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
+ (unsigned int)bc_crc32e, (unsigned int)crc);
+
+ rc = -EIO;
+
+out:
+ bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+ bc->bc_ecc = cpu_to_le16(bc_ecc);
+
+ return rc;
+}
+
+/*
+ * This function generates check information for a list of buffer_heads.
+ * bhs is the blocks to be checked. bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information. If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc)
+{
+ int i;
+ u32 crc, ecc;
+
+ BUG_ON(nr < 0);
+
+ if (!nr)
+ return;
+
+ memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+ for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
+ crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+ /*
+ * The number of bits in a buffer is obviously b_size*8.
+ * The offset of this buffer is b_size*i, so the bit offset
+ * of this buffer is b_size*8*i.
+ */
+ ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+ bhs[i]->b_size * 8,
+ bhs[i]->b_size * 8 * i);
+ }
+
+ /*
+ * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+ * larger than 16 bits.
+ */
+ BUG_ON(ecc > USHRT_MAX);
+
+ bc->bc_crc32e = cpu_to_le32(crc);
+ bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information on a list of
+ * buffer_heads. Like _compute_bhs, the function will take care of
+ * zeroing bc before calculating check codes. If bc is not a pointer
+ * inside data, the caller must have zeroed any inline
+ * ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc,
+ struct ocfs2_blockcheck_stats *stats)
+{
+ int i, rc = 0;
+ u32 bc_crc32e;
+ u16 bc_ecc;
+ u32 crc, ecc, fix;
+
+ BUG_ON(nr < 0);
+
+ if (!nr)
+ return 0;
+
+ ocfs2_blockcheck_inc_check(stats);
+
+ bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+ bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+ memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+ /* Fast path - if the crc32 validates, we're good to go */
+ for (i = 0, crc = ~0; i < nr; i++)
+ crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+ if (crc == bc_crc32e)
+ goto out;
+
+ ocfs2_blockcheck_inc_failure(stats);
+ mlog(ML_ERROR,
+ "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
+ (unsigned int)bc_crc32e, (unsigned int)crc);
+
+ /* Ok, try ECC fixups */
+ for (i = 0, ecc = 0; i < nr; i++) {
+ /*
+ * The number of bits in a buffer is obviously b_size*8.
+ * The offset of this buffer is b_size*i, so the bit offset
+ * of this buffer is b_size*8*i.
+ */
+ ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+ bhs[i]->b_size * 8,
+ bhs[i]->b_size * 8 * i);
+ }
+ fix = ecc ^ bc_ecc;
+ for (i = 0; i < nr; i++) {
+ /*
+ * Try the fix against each buffer. It will only affect
+ * one of them.
+ */
+ ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
+ bhs[i]->b_size * 8 * i, fix);
+ }
+
+ /* And check the crc32 again */
+ for (i = 0, crc = ~0; i < nr; i++)
+ crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+ if (crc == bc_crc32e) {
+ ocfs2_blockcheck_inc_recover(stats);
+ goto out;
+ }
+
+ mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+ (unsigned int)bc_crc32e, (unsigned int)crc);
+
+ rc = -EIO;
+
+out:
+ bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+ bc->bc_ecc = cpu_to_le16(bc_ecc);
+
+ return rc;
+}
+
+/*
+ * These are the main API. They check the superblock flag before
+ * calling the underlying operations.
+ *
+ * They expect the buffer(s) to be in disk format.
+ */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+ struct ocfs2_block_check *bc)
+{
+ if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+ ocfs2_block_check_compute(data, sb->s_blocksize, bc);
+}
+
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+ struct ocfs2_block_check *bc)
+{
+ int rc = 0;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+
+ if (ocfs2_meta_ecc(osb))
+ rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc,
+ &osb->osb_ecc_stats);
+
+ return rc;
+}
+
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+ struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc)
+{
+ if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+ ocfs2_block_check_compute_bhs(bhs, nr, bc);
+}
+
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+ struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc)
+{
+ int rc = 0;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+
+ if (ocfs2_meta_ecc(osb))
+ rc = ocfs2_block_check_validate_bhs(bhs, nr, bc,
+ &osb->osb_ecc_stats);
+
+ return rc;
+}
+
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
new file mode 100644
index 00000000000..d4b69febf70
--- /dev/null
+++ b/fs/ocfs2/blockcheck.h
@@ -0,0 +1,107 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.h
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2004, 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_BLOCKCHECK_H
+#define OCFS2_BLOCKCHECK_H
+
+
+/* Count errors and error correction from blockcheck.c */
+struct ocfs2_blockcheck_stats {
+ spinlock_t b_lock;
+ u64 b_check_count; /* Number of blocks we've checked */
+ u64 b_failure_count; /* Number of failed checksums */
+ u64 b_recover_count; /* Number of blocks fixed by ecc */
+
+ /*
+ * debugfs entries, used if this is passed to
+ * ocfs2_blockcheck_stats_debugfs_install()
+ */
+ struct dentry *b_debug_dir; /* Parent of the debugfs files */
+ struct dentry *b_debug_check; /* Exposes b_check_count */
+ struct dentry *b_debug_failure; /* Exposes b_failure_count */
+ struct dentry *b_debug_recover; /* Exposes b_recover_count */
+};
+
+
+/* High level block API */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+ struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+ struct ocfs2_block_check *bc);
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+ struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+ struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc);
+
+/* Lower level API */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+ struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+ struct ocfs2_block_check *bc,
+ struct ocfs2_blockcheck_stats *stats);
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc,
+ struct ocfs2_blockcheck_stats *stats);
+
+/* Debug Initialization */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+ struct dentry *parent);
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats);
+
+/*
+ * Hamming code functions
+ */
+
+/*
+ * Encoding hamming code parity bits for a buffer.
+ *
+ * This is the low level encoder function. It can be called across
+ * multiple hunks just like the crc32 code. 'd' is the number of bits
+ * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
+ unsigned int nr);
+/*
+ * Fix a buffer with a bit error. The 'fix' is the original parity
+ * xor'd with the parity calculated now.
+ *
+ * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
+ * offset of the current hunk. If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one buffer, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+ unsigned int fix);
+
+/* Convenience wrappers for a single buffer of data */
+extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
+extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+ unsigned int fix);
+#endif
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index c9037414f4f..1edcb141f63 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <cluster/masklog.h>
@@ -36,16 +35,27 @@
#include "inode.h"
#include "journal.h"
#include "uptodate.h"
-
#include "buffer_head_io.h"
+#include "ocfs2_trace.h"
+
+/*
+ * Bits on bh->b_state used by ocfs2.
+ *
+ * These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart.
+ */
+enum ocfs2_state_bits {
+ BH_NeedsValidate = BH_JBDPrivateStart,
+};
+
+/* Expand the magic b_state functions */
+BUFFER_FNS(NeedsValidate, needs_validate);
int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
- struct inode *inode)
+ struct ocfs2_caching_info *ci)
{
int ret = 0;
- mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
- (unsigned long long)bh->b_blocknr, inode);
+ trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci);
BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
BUG_ON(buffer_jbd(bh));
@@ -55,10 +65,11 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
* can get modified during recovery even if read-only. */
if (ocfs2_is_hard_readonly(osb)) {
ret = -EROFS;
+ mlog_errno(ret);
goto out;
}
- mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_lock(ci);
lock_buffer(bh);
set_buffer_uptodate(bh);
@@ -66,44 +77,119 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
/* remove from dirty list before I/O. */
clear_buffer_dirty(bh);
- get_bh(bh); /* for end_buffer_write_sync() */
+ get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh)) {
- ocfs2_set_buffer_uptodate(inode, bh);
+ ocfs2_set_buffer_uptodate(ci, bh);
} else {
/* We don't need to remove the clustered uptodate
* information for this bh as it's not marked locally
* uptodate. */
ret = -EIO;
- brelse(bh);
+ mlog_errno(ret);
}
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_unlock(ci);
out:
- mlog_exit(ret);
return ret;
}
-int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+ unsigned int nr, struct buffer_head *bhs[])
+{
+ int status = 0;
+ unsigned int i;
+ struct buffer_head *bh;
+
+ trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
+
+ if (!nr)
+ goto bail;
+
+ for (i = 0 ; i < nr ; i++) {
+ if (bhs[i] == NULL) {
+ bhs[i] = sb_getblk(osb->sb, block++);
+ if (bhs[i] == NULL) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+ bh = bhs[i];
+
+ if (buffer_jbd(bh)) {
+ trace_ocfs2_read_blocks_sync_jbd(
+ (unsigned long long)bh->b_blocknr);
+ continue;
+ }
+
+ if (buffer_dirty(bh)) {
+ /* This should probably be a BUG, or
+ * at least return an error. */
+ mlog(ML_ERROR,
+ "trying to sync read a dirty "
+ "buffer! (blocknr = %llu), skipping\n",
+ (unsigned long long)bh->b_blocknr);
+ continue;
+ }
+
+ lock_buffer(bh);
+ if (buffer_jbd(bh)) {
+ mlog(ML_ERROR,
+ "block %llu had the JBD bit set "
+ "while I was in lock_buffer!",
+ (unsigned long long)bh->b_blocknr);
+ BUG();
+ }
+
+ clear_buffer_uptodate(bh);
+ get_bh(bh); /* for end_buffer_read_sync() */
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READ, bh);
+ }
+
+ for (i = nr; i > 0; i--) {
+ bh = bhs[i - 1];
+
+ /* No need to wait on the buffer if it's managed by JBD. */
+ if (!buffer_jbd(bh))
+ wait_on_buffer(bh);
+
+ if (!buffer_uptodate(bh)) {
+ /* Status won't be cleared from here on out,
+ * so we can safely record this and loop back
+ * to cleanup the other buffers. */
+ status = -EIO;
+ put_bh(bh);
+ bhs[i - 1] = NULL;
+ }
+ }
+
+bail:
+ return status;
+}
+
+int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bhs[], int flags,
- struct inode *inode)
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh))
{
int status = 0;
- struct super_block *sb;
int i, ignore_cache = 0;
struct buffer_head *bh;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
- mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
- (unsigned long long)block, nr, flags, inode);
+ trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
+ BUG_ON(!ci);
BUG_ON((flags & OCFS2_BH_READAHEAD) &&
- (!inode || !(flags & OCFS2_BH_CACHED)));
+ (flags & OCFS2_BH_IGNORE_CACHE));
- if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+ if (bhs == NULL) {
status = -EINVAL;
mlog_errno(status);
goto bail;
@@ -117,31 +203,23 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
}
if (nr == 0) {
- mlog(ML_BH_IO, "No buffers will be read!\n");
status = 0;
goto bail;
}
- sb = osb->sb;
-
- if (flags & OCFS2_BH_CACHED && !inode)
- flags &= ~OCFS2_BH_CACHED;
-
- if (inode)
- mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_lock(ci);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(sb, block++);
if (bhs[i] == NULL) {
- if (inode)
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
- status = -EIO;
+ ocfs2_metadata_cache_io_unlock(ci);
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
}
bh = bhs[i];
- ignore_cache = 0;
+ ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
/* There are three read-ahead cases here which we need to
* be concerned with. All three assume a buffer has
@@ -167,32 +245,26 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
* before our is-it-in-flight check.
*/
- if (flags & OCFS2_BH_CACHED &&
- !ocfs2_buffer_uptodate(inode, bh)) {
- mlog(ML_UPTODATE,
- "bh (%llu), inode %llu not uptodate\n",
+ if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
+ trace_ocfs2_read_blocks_from_disk(
(unsigned long long)bh->b_blocknr,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ (unsigned long long)ocfs2_metadata_cache_owner(ci));
+ /* We're using ignore_cache here to say
+ * "go to disk" */
ignore_cache = 1;
}
- /* XXX: Can we ever get this and *not* have the cached
- * flag set? */
+ trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr,
+ ignore_cache, buffer_jbd(bh), buffer_dirty(bh));
+
if (buffer_jbd(bh)) {
- if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
- mlog(ML_BH_IO, "trying to sync read a jbd "
- "managed bh (blocknr = %llu)\n",
- (unsigned long long)bh->b_blocknr);
continue;
}
- if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+ if (ignore_cache) {
if (buffer_dirty(bh)) {
/* This should probably be a BUG, or
* at least return an error. */
- mlog(ML_BH_IO, "asking me to sync read a dirty "
- "buffer! (blocknr = %llu)\n",
- (unsigned long long)bh->b_blocknr);
continue;
}
@@ -201,7 +273,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
* previously submitted request than we are
* done here. */
if ((flags & OCFS2_BH_READAHEAD)
- && ocfs2_buffer_read_ahead(inode, bh))
+ && ocfs2_buffer_read_ahead(ci, bh))
continue;
lock_buffer(bh);
@@ -221,15 +293,17 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
* previously read-ahead buffer may have
* completed I/O while we were waiting for the
* buffer lock. */
- if ((flags & OCFS2_BH_CACHED)
+ if (!(flags & OCFS2_BH_IGNORE_CACHE)
&& !(flags & OCFS2_BH_READAHEAD)
- && ocfs2_buffer_uptodate(inode, bh)) {
+ && ocfs2_buffer_uptodate(ci, bh)) {
unlock_buffer(bh);
continue;
}
clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
+ if (validate)
+ set_buffer_needs_validate(bh);
bh->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh);
continue;
@@ -243,7 +317,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
if (!(flags & OCFS2_BH_READAHEAD)) {
/* We know this can't have changed as we hold the
- * inode sem. Avoid doing any work on the bh if the
+ * owner sem. Avoid doing any work on the bh if the
* journal has it. */
if (!buffer_jbd(bh))
wait_on_buffer(bh);
@@ -256,27 +330,98 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
* for this bh as it's not marked locally
* uptodate. */
status = -EIO;
- brelse(bh);
+ put_bh(bh);
bhs[i] = NULL;
continue;
}
+
+ if (buffer_needs_validate(bh)) {
+ /* We never set NeedsValidate if the
+ * buffer was held by the journal, so
+ * that better not have changed */
+ BUG_ON(buffer_jbd(bh));
+ clear_buffer_needs_validate(bh);
+ status = validate(sb, bh);
+ if (status) {
+ put_bh(bh);
+ bhs[i] = NULL;
+ continue;
+ }
+ }
}
/* Always set the buffer in the cache, even if it was
* a forced read, or read-ahead which hasn't yet
* completed. */
- if (inode)
- ocfs2_set_buffer_uptodate(inode, bh);
+ ocfs2_set_buffer_uptodate(ci, bh);
}
- if (inode)
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_unlock(ci);
- mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
- (unsigned long long)block, nr,
- (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags);
+ trace_ocfs2_read_blocks_end((unsigned long long)block, nr,
+ flags, ignore_cache);
bail:
- mlog_exit(status);
return status;
}
+
+/* Check whether the blkno is the super block or one of the backups. */
+static void ocfs2_check_super_or_backup(struct super_block *sb,
+ sector_t blkno)
+{
+ int i;
+ u64 backup_blkno;
+
+ if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
+ return;
+
+ for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+ backup_blkno = ocfs2_backup_super_blkno(sb, i);
+ if (backup_blkno == blkno)
+ return;
+ }
+
+ BUG();
+}
+
+/*
+ * Write super block and backups doesn't need to collaborate with journal,
+ * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed
+ * into this function.
+ */
+int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
+ struct buffer_head *bh)
+{
+ int ret = 0;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ BUG_ON(buffer_jbd(bh));
+ ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
+ ret = -EROFS;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ lock_buffer(bh);
+ set_buffer_uptodate(bh);
+
+ /* remove from dirty list before I/O. */
+ clear_buffer_dirty(bh);
+
+ get_bh(bh); /* for end_buffer_write_sync() */
+ bh->b_end_io = end_buffer_write_sync;
+ ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
+ submit_bh(WRITE, bh);
+
+ wait_on_buffer(bh);
+
+ if (!buffer_uptodate(bh)) {
+ ret = -EIO;
+ mlog_errno(ret);
+ }
+
+out:
+ return ret;
+}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 6cc20930fac..b97bcc6dde7 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,29 +31,34 @@
void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
int uptodate);
-static inline int ocfs2_read_block(struct ocfs2_super *osb,
- u64 off,
- struct buffer_head **bh,
- int flags,
- struct inode *inode);
-
int ocfs2_write_block(struct ocfs2_super *osb,
struct buffer_head *bh,
- struct inode *inode);
-int ocfs2_read_blocks(struct ocfs2_super *osb,
- u64 block,
- int nr,
- struct buffer_head *bhs[],
- int flags,
- struct inode *inode);
+ struct ocfs2_caching_info *ci);
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+ unsigned int nr, struct buffer_head *bhs[]);
+/*
+ * If not NULL, validate() will be called on a buffer that is freshly
+ * read from disk. It will not be called if the buffer was in cache.
+ * Note that if validate() is being used for this buffer, it needs to
+ * be set even for a READAHEAD call, as it marks the buffer for later
+ * validation.
+ */
+int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
+ struct buffer_head *bhs[], int flags,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh));
-#define OCFS2_BH_CACHED 1
+int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
+ struct buffer_head *bh);
+
+#define OCFS2_BH_IGNORE_CACHE 1
#define OCFS2_BH_READAHEAD 8
-static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
- struct buffer_head **bh, int flags,
- struct inode *inode)
+static inline int ocfs2_read_block(struct ocfs2_caching_info *ci, u64 off,
+ struct buffer_head **bh,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh))
{
int status = 0;
@@ -63,8 +68,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
goto bail;
}
- status = ocfs2_read_blocks(osb, off, 1, bh,
- flags, inode);
+ status = ocfs2_read_blocks(ci, off, 1, bh, 0, validate);
bail:
return status;
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index cdd162f1365..1aefc0350ec 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
- quorum.o tcp.o ver.o
+ quorum.o tcp.o netdebug.o
diff --git a/fs/ocfs2/cluster/endian.h b/fs/ocfs2/cluster/endian.h
deleted file mode 100644
index 2df9082f4e3..00000000000
--- a/fs/ocfs2/cluster/endian.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_CLUSTER_ENDIAN_H
-#define OCFS2_CLUSTER_ENDIAN_H
-
-static inline void be32_add_cpu(__be32 *var, u32 val)
-{
- *var = cpu_to_be32(be32_to_cpu(*var) + val);
-}
-
-#endif /* OCFS2_CLUSTER_ENDIAN_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 305cba3681f..73039295d0d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -33,6 +33,9 @@
#include <linux/random.h>
#include <linux/crc32.h>
#include <linux/time.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
#include "heartbeat.h"
#include "tcp.h"
@@ -60,6 +63,54 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
static LIST_HEAD(o2hb_node_events);
static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
+/*
+ * In global heartbeat, we maintain a series of region bitmaps.
+ * - o2hb_region_bitmap allows us to limit the region number to max region.
+ * - o2hb_live_region_bitmap tracks live regions (seen steady iterations).
+ * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
+ * heartbeat on it.
+ * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
+ */
+static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+
+#define O2HB_DB_TYPE_LIVENODES 0
+#define O2HB_DB_TYPE_LIVEREGIONS 1
+#define O2HB_DB_TYPE_QUORUMREGIONS 2
+#define O2HB_DB_TYPE_FAILEDREGIONS 3
+#define O2HB_DB_TYPE_REGION_LIVENODES 4
+#define O2HB_DB_TYPE_REGION_NUMBER 5
+#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
+#define O2HB_DB_TYPE_REGION_PINNED 7
+struct o2hb_debug_buf {
+ int db_type;
+ int db_size;
+ int db_len;
+ void *db_data;
+};
+
+static struct o2hb_debug_buf *o2hb_db_livenodes;
+static struct o2hb_debug_buf *o2hb_db_liveregions;
+static struct o2hb_debug_buf *o2hb_db_quorumregions;
+static struct o2hb_debug_buf *o2hb_db_failedregions;
+
+#define O2HB_DEBUG_DIR "o2hb"
+#define O2HB_DEBUG_LIVENODES "livenodes"
+#define O2HB_DEBUG_LIVEREGIONS "live_regions"
+#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions"
+#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
+#define O2HB_DEBUG_REGION_NUMBER "num"
+#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
+#define O2HB_DEBUG_REGION_PINNED "pinned"
+
+static struct dentry *o2hb_debug_dir;
+static struct dentry *o2hb_debug_livenodes;
+static struct dentry *o2hb_debug_liveregions;
+static struct dentry *o2hb_debug_quorumregions;
+static struct dentry *o2hb_debug_failedregions;
+
static LIST_HEAD(o2hb_all_regions);
static struct o2hb_callback {
@@ -70,9 +121,48 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
#define O2HB_DEFAULT_BLOCK_BITS 9
+enum o2hb_heartbeat_modes {
+ O2HB_HEARTBEAT_LOCAL = 0,
+ O2HB_HEARTBEAT_GLOBAL,
+ O2HB_HEARTBEAT_NUM_MODES,
+};
+
+char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
+ "local", /* O2HB_HEARTBEAT_LOCAL */
+ "global", /* O2HB_HEARTBEAT_GLOBAL */
+};
+
unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
+unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
+
+/*
+ * o2hb_dependent_users tracks the number of registered callbacks that depend
+ * on heartbeat. o2net and o2dlm are two entities that register this callback.
+ * However only o2dlm depends on the heartbeat. It does not want the heartbeat
+ * to stop while a dlm domain is still active.
+ */
+unsigned int o2hb_dependent_users;
+
+/*
+ * In global heartbeat mode, all regions are pinned if there are one or more
+ * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
+ * regions are unpinned if the region count exceeds the cut off or the number
+ * of dependent users falls to zero.
+ */
+#define O2HB_PIN_CUT_OFF 3
-/* Only sets a new threshold if there are no active regions.
+/*
+ * In local heartbeat mode, we assume the dlm domain name to be the same as
+ * region uuid. This is true for domains created for the file system but not
+ * necessarily true for userdlm domains. This is a known limitation.
+ *
+ * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
+ * works for both file system and userdlm domains.
+ */
+static int o2hb_region_pin(const char *region_uuid);
+static void o2hb_region_unpin(const char *region_uuid);
+
+/* Only sets a new threshold if there are no active regions.
*
* No locking or otherwise interesting code is required for reading
* o2hb_dead_threshold as it can't change once regions are active and
@@ -87,6 +177,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
}
}
+static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
+{
+ int ret = -1;
+
+ if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
+ spin_lock(&o2hb_live_lock);
+ if (list_empty(&o2hb_all_regions)) {
+ o2hb_heartbeat_mode = hb_mode;
+ ret = 0;
+ }
+ spin_unlock(&o2hb_live_lock);
+ }
+
+ return ret;
+}
+
struct o2hb_node_event {
struct list_head hn_item;
enum o2hb_callback_type hn_event_type;
@@ -110,7 +216,10 @@ struct o2hb_region {
struct config_item hr_item;
struct list_head hr_all_item;
- unsigned hr_unclean_stop:1;
+ unsigned hr_unclean_stop:1,
+ hr_aborted_start:1,
+ hr_item_pinned:1,
+ hr_item_dropped:1;
/* protected by the hr_callback_sem */
struct task_struct *hr_task;
@@ -128,11 +237,29 @@ struct o2hb_region {
struct block_device *hr_bdev;
struct o2hb_disk_slot *hr_slots;
+ /* live node map of this region */
+ unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned int hr_region_num;
+
+ struct dentry *hr_debug_dir;
+ struct dentry *hr_debug_livenodes;
+ struct dentry *hr_debug_regnum;
+ struct dentry *hr_debug_elapsed_time;
+ struct dentry *hr_debug_pinned;
+ struct o2hb_debug_buf *hr_db_livenodes;
+ struct o2hb_debug_buf *hr_db_regnum;
+ struct o2hb_debug_buf *hr_db_elapsed_time;
+ struct o2hb_debug_buf *hr_db_pinned;
+
/* let the person setting up hb wait for it to return until it
* has reached a 'steady' state. This will be fixed when we have
* a more complete api that doesn't lead to this sort of fragility. */
atomic_t hr_steady_iterations;
+ /* terminate o2hb thread if it does not reach steady state
+ * (hr_steady_iterations == 0) within hr_unsteady_iterations */
+ atomic_t hr_unsteady_iterations;
+
char hr_dev_name[BDEVNAME_SIZE];
unsigned int hr_timeout_ms;
@@ -141,7 +268,7 @@ struct o2hb_region {
* recognizes a node going up and down in one iteration */
u64 hr_generation;
- struct work_struct hr_write_timeout_work;
+ struct delayed_work hr_write_timeout_work;
unsigned long hr_last_timeout_start;
/* Used during o2hb_check_slot to hold a copy of the block
@@ -156,20 +283,56 @@ struct o2hb_bio_wait_ctxt {
int wc_error;
};
-static void o2hb_write_timeout(void *arg)
+static void o2hb_write_timeout(struct work_struct *work)
{
- struct o2hb_region *reg = arg;
+ int failed, quorum;
+ unsigned long flags;
+ struct o2hb_region *reg =
+ container_of(work, struct o2hb_region,
+ hr_write_timeout_work.work);
mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
"milliseconds\n", reg->hr_dev_name,
- jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
+ jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
+
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock_irqsave(&o2hb_live_lock, flags);
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+ failed = bitmap_weight(o2hb_failed_region_bitmap,
+ O2NM_MAX_REGIONS);
+ quorum = bitmap_weight(o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS);
+ spin_unlock_irqrestore(&o2hb_live_lock, flags);
+
+ mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
+ quorum, failed);
+
+ /*
+ * Fence if the number of failed regions >= half the number
+ * of quorum regions
+ */
+ if ((failed << 1) < quorum)
+ return;
+ }
+
o2quo_disk_timeout();
}
static void o2hb_arm_write_timeout(struct o2hb_region *reg)
{
- mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS);
+ /* Arm writeout only after thread reaches steady state */
+ if (atomic_read(&reg->hr_steady_iterations) != 0)
+ return;
+
+ mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
+ O2HB_MAX_WRITE_TIMEOUT_MS);
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ }
cancel_delayed_work(&reg->hr_write_timeout_work);
reg->hr_last_timeout_start = jiffies;
schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -178,14 +341,12 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
{
- cancel_delayed_work(&reg->hr_write_timeout_work);
- flush_scheduled_work();
+ cancel_delayed_work_sync(&reg->hr_write_timeout_work);
}
-static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc,
- unsigned int num_ios)
+static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
{
- atomic_set(&wc->wc_num_reqs, num_ios);
+ atomic_set(&wc->wc_num_reqs, 1);
init_completion(&wc->wc_io_complete);
wc->wc_error = 0;
}
@@ -207,15 +368,11 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
static void o2hb_wait_on_io(struct o2hb_region *reg,
struct o2hb_bio_wait_ctxt *wc)
{
- struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
-
- blk_run_address_space(mapping);
-
+ o2hb_bio_wait_dec(wc, 1);
wait_for_completion(&wc->wc_io_complete);
}
-static int o2hb_bio_end_io(struct bio *bio,
- unsigned int bytes_done,
+static void o2hb_bio_end_io(struct bio *bio,
int error)
{
struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
@@ -225,34 +382,30 @@ static int o2hb_bio_end_io(struct bio *bio,
wc->wc_error = error;
}
- if (bio->bi_size)
- return 1;
-
o2hb_bio_wait_dec(wc, 1);
- return 0;
+ bio_put(bio);
}
/* Setup a Bio to cover I/O against num_slots slots starting at
* start_slot. */
static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
struct o2hb_bio_wait_ctxt *wc,
- unsigned int start_slot,
- unsigned int num_slots)
+ unsigned int *current_slot,
+ unsigned int max_slots)
{
- int i, nr_vecs, len, first_page, last_page;
+ int len, current_page;
unsigned int vec_len, vec_start;
unsigned int bits = reg->hr_block_bits;
unsigned int spp = reg->hr_slots_per_page;
+ unsigned int cs = *current_slot;
struct bio *bio;
struct page *page;
- nr_vecs = (num_slots + spp - 1) / spp;
-
/* Testing has shown this allocation to take long enough under
* GFP_KERNEL that the local node can get fenced. It would be
* nicest if we could pre-allocate these bios and avoid this
* all together. */
- bio = bio_alloc(GFP_ATOMIC, nr_vecs);
+ bio = bio_alloc(GFP_ATOMIC, 16);
if (!bio) {
mlog(ML_ERROR, "Could not alloc slots BIO!\n");
bio = ERR_PTR(-ENOMEM);
@@ -260,137 +413,53 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
}
/* Must put everything in 512 byte sectors for the bio... */
- bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9);
+ bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
bio->bi_bdev = reg->hr_bdev;
bio->bi_private = wc;
bio->bi_end_io = o2hb_bio_end_io;
- first_page = start_slot / spp;
- last_page = first_page + nr_vecs;
- vec_start = (start_slot << bits) % PAGE_CACHE_SIZE;
- for(i = first_page; i < last_page; i++) {
- page = reg->hr_slot_data[i];
+ vec_start = (cs << bits) % PAGE_CACHE_SIZE;
+ while(cs < max_slots) {
+ current_page = cs / spp;
+ page = reg->hr_slot_data[current_page];
- vec_len = PAGE_CACHE_SIZE;
- /* last page might be short */
- if (((i + 1) * spp) > (start_slot + num_slots))
- vec_len = ((num_slots + start_slot) % spp) << bits;
- vec_len -= vec_start;
+ vec_len = min(PAGE_CACHE_SIZE - vec_start,
+ (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
- i, vec_len, vec_start);
+ current_page, vec_len, vec_start);
len = bio_add_page(bio, page, vec_len, vec_start);
- if (len != vec_len) {
- bio_put(bio);
- bio = ERR_PTR(-EIO);
-
- mlog(ML_ERROR, "Error adding page to bio i = %d, "
- "vec_len = %u, len = %d\n, start = %u\n",
- i, vec_len, len, vec_start);
- goto bail;
- }
+ if (len != vec_len) break;
+ cs += vec_len / (PAGE_CACHE_SIZE/spp);
vec_start = 0;
}
bail:
+ *current_slot = cs;
return bio;
}
-/*
- * Compute the maximum number of sectors the bdev can handle in one bio,
- * as a power of two.
- *
- * Stolen from oracleasm, thanks Joel!
- */
-static int compute_max_sectors(struct block_device *bdev)
-{
- int max_pages, max_sectors, pow_two_sectors;
-
- struct request_queue *q;
-
- q = bdev_get_queue(bdev);
- max_pages = q->max_sectors >> (PAGE_SHIFT - 9);
- if (max_pages > BIO_MAX_PAGES)
- max_pages = BIO_MAX_PAGES;
- if (max_pages > q->max_phys_segments)
- max_pages = q->max_phys_segments;
- if (max_pages > q->max_hw_segments)
- max_pages = q->max_hw_segments;
- max_pages--; /* Handle I/Os that straddle a page */
-
- if (max_pages) {
- max_sectors = max_pages << (PAGE_SHIFT - 9);
- } else {
- /* If BIO contains 1 or less than 1 page. */
- max_sectors = q->max_sectors;
- }
- /* Why is fls() 1-based???? */
- pow_two_sectors = 1 << (fls(max_sectors) - 1);
-
- return pow_two_sectors;
-}
-
-static inline void o2hb_compute_request_limits(struct o2hb_region *reg,
- unsigned int num_slots,
- unsigned int *num_bios,
- unsigned int *slots_per_bio)
-{
- unsigned int max_sectors, io_sectors;
-
- max_sectors = compute_max_sectors(reg->hr_bdev);
-
- io_sectors = num_slots << (reg->hr_block_bits - 9);
-
- *num_bios = (io_sectors + max_sectors - 1) / max_sectors;
- *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9);
-
- mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This "
- "device can handle %u sectors of I/O\n", io_sectors, num_slots,
- max_sectors);
- mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n",
- *num_bios, *slots_per_bio);
-}
-
static int o2hb_read_slots(struct o2hb_region *reg,
unsigned int max_slots)
{
- unsigned int num_bios, slots_per_bio, start_slot, num_slots;
- int i, status;
+ unsigned int current_slot=0;
+ int status;
struct o2hb_bio_wait_ctxt wc;
- struct bio **bios;
struct bio *bio;
- o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio);
-
- bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL);
- if (!bios) {
- status = -ENOMEM;
- mlog_errno(status);
- return status;
- }
-
- o2hb_bio_wait_init(&wc, num_bios);
+ o2hb_bio_wait_init(&wc);
- num_slots = slots_per_bio;
- for(i = 0; i < num_bios; i++) {
- start_slot = i * slots_per_bio;
-
- /* adjust num_slots at last bio */
- if (max_slots < (start_slot + num_slots))
- num_slots = max_slots - start_slot;
-
- bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots);
+ while(current_slot < max_slots) {
+ bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots);
if (IS_ERR(bio)) {
- o2hb_bio_wait_dec(&wc, num_bios - i);
-
status = PTR_ERR(bio);
mlog_errno(status);
goto bail_and_wait;
}
- bios[i] = bio;
+ atomic_inc(&wc.wc_num_reqs);
submit_bio(READ, bio);
}
@@ -401,38 +470,30 @@ bail_and_wait:
if (wc.wc_error && !status)
status = wc.wc_error;
- if (bios) {
- for(i = 0; i < num_bios; i++)
- if (bios[i])
- bio_put(bios[i]);
- kfree(bios);
- }
-
return status;
}
static int o2hb_issue_node_write(struct o2hb_region *reg,
- struct bio **write_bio,
struct o2hb_bio_wait_ctxt *write_wc)
{
int status;
unsigned int slot;
struct bio *bio;
- o2hb_bio_wait_init(write_wc, 1);
+ o2hb_bio_wait_init(write_wc);
slot = o2nm_this_node();
- bio = o2hb_setup_one_bio(reg, write_wc, slot, 1);
+ bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1);
if (IS_ERR(bio)) {
status = PTR_ERR(bio);
mlog_errno(status);
goto bail;
}
- submit_bio(WRITE, bio);
+ atomic_inc(&write_wc->wc_num_reqs);
+ submit_bio(WRITE_SYNC, bio);
- *write_bio = bio;
status = 0;
bail:
return status;
@@ -477,27 +538,50 @@ static int o2hb_verify_crc(struct o2hb_region *reg,
return read == computed;
}
-/* We want to make sure that nobody is heartbeating on top of us --
- * this will help detect an invalid configuration. */
-static int o2hb_check_last_timestamp(struct o2hb_region *reg)
+/*
+ * Compare the slot data with what we wrote in the last iteration.
+ * If the match fails, print an appropriate error message. This is to
+ * detect errors like... another node hearting on the same slot,
+ * flaky device that is losing writes, etc.
+ * Returns 1 if check succeeds, 0 otherwise.
+ */
+static int o2hb_check_own_slot(struct o2hb_region *reg)
{
- int node_num, ret;
struct o2hb_disk_slot *slot;
struct o2hb_disk_heartbeat_block *hb_block;
+ char *errstr;
- node_num = o2nm_this_node();
-
- ret = 1;
- slot = &reg->hr_slots[node_num];
+ slot = &reg->hr_slots[o2nm_this_node()];
/* Don't check on our 1st timestamp */
- if (slot->ds_last_time) {
- hb_block = slot->ds_raw_block;
+ if (!slot->ds_last_time)
+ return 0;
- if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time)
- ret = 0;
- }
+ hb_block = slot->ds_raw_block;
+ if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
+ le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
+ hb_block->hb_node == slot->ds_node_num)
+ return 1;
- return ret;
+#define ERRSTR1 "Another node is heartbeating on device"
+#define ERRSTR2 "Heartbeat generation mismatch on device"
+#define ERRSTR3 "Heartbeat sequence mismatch on device"
+
+ if (hb_block->hb_node != slot->ds_node_num)
+ errstr = ERRSTR1;
+ else if (le64_to_cpu(hb_block->hb_generation) !=
+ slot->ds_last_generation)
+ errstr = ERRSTR2;
+ else
+ errstr = ERRSTR3;
+
+ mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), "
+ "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name,
+ slot->ds_node_num, (unsigned long long)slot->ds_last_generation,
+ (unsigned long long)slot->ds_last_time, hb_block->hb_node,
+ (unsigned long long)le64_to_cpu(hb_block->hb_generation),
+ (unsigned long long)le64_to_cpu(hb_block->hb_seq));
+
+ return 0;
}
static inline void o2hb_prepare_block(struct o2hb_region *reg,
@@ -528,7 +612,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg,
hb_block));
mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n",
- (long long)cpu_to_le64(generation),
+ (long long)generation,
le32_to_cpu(hb_block->hb_cksum));
}
@@ -536,11 +620,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
struct o2nm_node *node,
int idx)
{
- struct list_head *iter;
struct o2hb_callback_func *f;
- list_for_each(iter, &hbcall->list) {
- f = list_entry(iter, struct o2hb_callback_func, hc_item);
+ list_for_each_entry(f, &hbcall->list, hc_item) {
mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
(f->hc_func)(node, idx, f->hc_data);
}
@@ -549,16 +631,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
/* Will run the list in order until we process the passed event */
static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
{
- int empty;
struct o2hb_callback *hbcall;
struct o2hb_node_event *event;
- spin_lock(&o2hb_live_lock);
- empty = list_empty(&queued_event->hn_item);
- spin_unlock(&o2hb_live_lock);
- if (empty)
- return;
-
/* Holding callback sem assures we don't alter the callback
* lists when doing this, and serializes ourselves with other
* processes wanting callbacks. */
@@ -600,6 +675,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
{
assert_spin_locked(&o2hb_live_lock);
+ BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
+
event->hn_event_type = type;
event->hn_node = node;
event->hn_node_num = node_num;
@@ -615,6 +692,7 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
struct o2hb_node_event event =
{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
struct o2nm_node *node;
+ int queued = 0;
node = o2nm_get_node_by_num(slot->ds_node_num);
if (!node)
@@ -632,15 +710,60 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
slot->ds_node_num);
+ queued = 1;
}
}
spin_unlock(&o2hb_live_lock);
- o2hb_run_event_list(&event);
+ if (queued)
+ o2hb_run_event_list(&event);
o2nm_node_put(node);
}
+static void o2hb_set_quorum_device(struct o2hb_region *reg)
+{
+ if (!o2hb_global_heartbeat_active())
+ return;
+
+ /* Prevent race with o2hb_heartbeat_group_drop_item() */
+ if (kthread_should_stop())
+ return;
+
+ /* Tag region as quorum only after thread reaches steady state */
+ if (atomic_read(&reg->hr_steady_iterations) != 0)
+ return;
+
+ spin_lock(&o2hb_live_lock);
+
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ goto unlock;
+
+ /*
+ * A region can be added to the quorum only when it sees all
+ * live nodes heartbeat on it. In other words, the region has been
+ * added to all nodes.
+ */
+ if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+ sizeof(o2hb_live_node_bitmap)))
+ goto unlock;
+
+ printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+
+ set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+
+ /*
+ * If global heartbeat active, unpin all regions if the
+ * region count > CUT_OFF
+ */
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
+ o2hb_region_unpin(NULL);
+unlock:
+ spin_unlock(&o2hb_live_lock);
+}
+
static int o2hb_check_slot(struct o2hb_region *reg,
struct o2hb_disk_slot *slot)
{
@@ -652,14 +775,23 @@ static int o2hb_check_slot(struct o2hb_region *reg,
u64 cputime;
unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
unsigned int slot_dead_ms;
+ int tmp;
+ int queued = 0;
memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
- /* Is this correct? Do we assume that the node doesn't exist
- * if we're not configured for him? */
+ /*
+ * If a node is no longer configured but is still in the livemap, we
+ * may need to clear that bit from the livemap.
+ */
node = o2nm_get_node_by_num(slot->ds_node_num);
- if (!node)
- return 0;
+ if (!node) {
+ spin_lock(&o2hb_live_lock);
+ tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ if (!tmp)
+ return 0;
+ }
if (!o2hb_verify_crc(reg, hb_block)) {
/* all paths from here will drop o2hb_live_lock for
@@ -712,7 +844,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
"seq %llu last %llu changed %u equal %u\n",
slot->ds_node_num, (long long)slot->ds_last_generation,
le32_to_cpu(hb_block->hb_cksum),
- (unsigned long long)le64_to_cpu(hb_block->hb_seq),
+ (unsigned long long)le64_to_cpu(hb_block->hb_seq),
(unsigned long long)slot->ds_last_time, slot->ds_changed_samples,
slot->ds_equal_samples);
@@ -726,14 +858,19 @@ fire_callbacks:
mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
slot->ds_node_num, (long long)slot->ds_last_generation);
+ set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
/* first on the list generates a callback */
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+ mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
+ "bitmap\n", slot->ds_node_num);
set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
slot->ds_node_num);
changed = 1;
+ queued = 1;
}
list_add_tail(&slot->ds_live_item,
@@ -771,15 +908,21 @@ fire_callbacks:
mlog(ML_HEARTBEAT, "Node %d left my region\n",
slot->ds_node_num);
+ clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
/* last off the live_slot generates a callback */
list_del_init(&slot->ds_live_item);
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+ mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
+ "nodes bitmap\n", slot->ds_node_num);
clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
- o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
- slot->ds_node_num);
+ /* node can be null */
+ o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
+ node, slot->ds_node_num);
changed = 1;
+ queued = 1;
}
/* We don't clear this because the node is still
@@ -795,49 +938,50 @@ fire_callbacks:
out:
spin_unlock(&o2hb_live_lock);
- o2hb_run_event_list(&event);
+ if (queued)
+ o2hb_run_event_list(&event);
- o2nm_node_put(node);
+ if (node)
+ o2nm_node_put(node);
return changed;
}
-/* This could be faster if we just implmented a find_last_bit, but I
- * don't think the circumstances warrant it. */
-static int o2hb_highest_node(unsigned long *nodes,
- int numbits)
+static int o2hb_highest_node(unsigned long *nodes, int numbits)
{
- int highest, node;
-
- highest = numbits;
- node = -1;
- while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
- if (node >= numbits)
- break;
-
- highest = node;
- }
-
- return highest;
+ return find_last_bit(nodes, numbits);
}
static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
{
- int i, ret, highest_node, change = 0;
+ int i, ret, highest_node;
+ int membership_change = 0, own_slot_ok = 0;
unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
- struct bio *write_bio;
+ unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct o2hb_bio_wait_ctxt write_wc;
ret = o2nm_configured_node_map(configured_nodes,
sizeof(configured_nodes));
if (ret) {
mlog_errno(ret);
- return ret;
+ goto bail;
+ }
+
+ /*
+ * If a node is not configured but is in the livemap, we still need
+ * to read the slot so as to be able to remove it from the livemap.
+ */
+ o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ i = -1;
+ while ((i = find_next_bit(live_node_bitmap,
+ O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+ set_bit(i, configured_nodes);
}
highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
if (highest_node >= O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
- return -EINVAL;
+ mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
+ ret = -EINVAL;
+ goto bail;
}
/* No sense in reading the slots of nodes that don't exist
@@ -847,31 +991,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
ret = o2hb_read_slots(reg, highest_node + 1);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
/* With an up to date view of the slots, we can check that no
* other node has been improperly configured to heartbeat in
* our slot. */
- if (!o2hb_check_last_timestamp(reg))
- mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
- "in our slot!\n", reg->hr_dev_name);
+ own_slot_ok = o2hb_check_own_slot(reg);
/* fill in the proper info for our next heartbeat */
o2hb_prepare_block(reg, reg->hr_generation);
- /* And fire off the write. Note that we don't wait on this I/O
- * until later. */
- ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
+ ret = o2hb_issue_node_write(reg, &write_wc);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
i = -1;
- while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
-
- change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
+ while((i = find_next_bit(configured_nodes,
+ O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+ membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
}
/*
@@ -880,25 +1020,45 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* people we find in our steady state have seen us.
*/
o2hb_wait_on_io(reg, &write_wc);
- bio_put(write_bio);
if (write_wc.wc_error) {
/* Do not re-arm the write timeout on I/O error - we
* can't be sure that the new block ever made it to
* disk */
mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
write_wc.wc_error, reg->hr_dev_name);
- return write_wc.wc_error;
+ ret = write_wc.wc_error;
+ goto bail;
}
- o2hb_arm_write_timeout(reg);
+ /* Skip disarming the timeout if own slot has stale/bad data */
+ if (own_slot_ok) {
+ o2hb_set_quorum_device(reg);
+ o2hb_arm_write_timeout(reg);
+ }
+bail:
/* let the person who launched us know when things are steady */
- if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
- if (atomic_dec_and_test(&reg->hr_steady_iterations))
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ if (!ret && own_slot_ok && !membership_change) {
+ if (atomic_dec_and_test(&reg->hr_steady_iterations))
+ wake_up(&o2hb_steady_queue);
+ }
+ }
+
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
+ printk(KERN_NOTICE "o2hb: Unable to stabilize "
+ "heartbeart on region %s (%s)\n",
+ config_item_name(&reg->hr_item),
+ reg->hr_dev_name);
+ atomic_set(&reg->hr_steady_iterations, 0);
+ reg->hr_aborted_start = 1;
wake_up(&o2hb_steady_queue);
+ ret = -EIO;
+ }
}
- return 0;
+ return ret;
}
/* Subtract b from a, storing the result in a. a *must* have a larger
@@ -941,37 +1101,39 @@ static int o2hb_thread(void *data)
{
int i, ret;
struct o2hb_region *reg = data;
- struct bio *write_bio;
struct o2hb_bio_wait_ctxt write_wc;
struct timeval before_hb, after_hb;
unsigned int elapsed_msec;
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
- set_user_nice(current, -20);
+ set_user_nice(current, MIN_NICE);
- while (!kthread_should_stop() && !reg->hr_unclean_stop) {
+ /* Pin node */
+ o2nm_depend_this_node();
+
+ while (!kthread_should_stop() &&
+ !reg->hr_unclean_stop && !reg->hr_aborted_start) {
/* We track the time spent inside
- * o2hb_do_disk_heartbeat so that we avoid more then
+ * o2hb_do_disk_heartbeat so that we avoid more than
* hr_timeout_ms between disk writes. On busy systems
* this should result in a heartbeat which is less
* likely to time itself out. */
do_gettimeofday(&before_hb);
- i = 0;
- do {
- ret = o2hb_do_disk_heartbeat(reg);
- } while (ret && ++i < 2);
+ ret = o2hb_do_disk_heartbeat(reg);
do_gettimeofday(&after_hb);
elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
- mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
+ mlog(ML_HEARTBEAT,
+ "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
elapsed_msec);
- if (elapsed_msec < reg->hr_timeout_ms) {
+ if (!kthread_should_stop() &&
+ elapsed_msec < reg->hr_timeout_ms) {
/* the kthread api has blocked signals for us so no
* need to record the return value. */
msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
@@ -988,23 +1150,236 @@ static int o2hb_thread(void *data)
* to timeout on this region when we could just as easily
* write a clear generation - thus indicating to them that
* this node has left this region.
- *
- * XXX: Should we skip this on unclean_stop? */
- o2hb_prepare_block(reg, 0);
- ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
- if (ret == 0) {
- o2hb_wait_on_io(reg, &write_wc);
- bio_put(write_bio);
- } else {
- mlog_errno(ret);
+ */
+ if (!reg->hr_unclean_stop && !reg->hr_aborted_start) {
+ o2hb_prepare_block(reg, 0);
+ ret = o2hb_issue_node_write(reg, &write_wc);
+ if (ret == 0)
+ o2hb_wait_on_io(reg, &write_wc);
+ else
+ mlog_errno(ret);
+ }
+
+ /* Unpin node */
+ o2nm_undepend_this_node();
+
+ mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n");
+
+ return 0;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
+ struct o2hb_debug_buf *db = inode->i_private;
+ struct o2hb_region *reg;
+ unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long lts;
+ char *buf = NULL;
+ int i = -1;
+ int out = 0;
+
+ /* max_nodes should be the largest bitmap we pass here */
+ BUG_ON(sizeof(map) < db->db_size);
+
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ goto bail;
+
+ switch (db->db_type) {
+ case O2HB_DB_TYPE_LIVENODES:
+ case O2HB_DB_TYPE_LIVEREGIONS:
+ case O2HB_DB_TYPE_QUORUMREGIONS:
+ case O2HB_DB_TYPE_FAILEDREGIONS:
+ spin_lock(&o2hb_live_lock);
+ memcpy(map, db->db_data, db->db_size);
+ spin_unlock(&o2hb_live_lock);
+ break;
+
+ case O2HB_DB_TYPE_REGION_LIVENODES:
+ spin_lock(&o2hb_live_lock);
+ reg = (struct o2hb_region *)db->db_data;
+ memcpy(map, reg->hr_live_node_bitmap, db->db_size);
+ spin_unlock(&o2hb_live_lock);
+ break;
+
+ case O2HB_DB_TYPE_REGION_NUMBER:
+ reg = (struct o2hb_region *)db->db_data;
+ out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
+ reg->hr_region_num);
+ goto done;
+
+ case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
+ reg = (struct o2hb_region *)db->db_data;
+ lts = reg->hr_last_timeout_start;
+ /* If 0, it has never been set before */
+ if (lts)
+ lts = jiffies_to_msecs(jiffies - lts);
+ out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
+ goto done;
+
+ case O2HB_DB_TYPE_REGION_PINNED:
+ reg = (struct o2hb_region *)db->db_data;
+ out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+ !!reg->hr_item_pinned);
+ goto done;
+
+ default:
+ goto done;
}
- mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
+ while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
+ out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+ out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+
+done:
+ i_size_write(inode, out);
+ file->private_data = buf;
+
+ return 0;
+bail:
+ return -ENOMEM;
+}
+
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+ i_size_read(file->f_mapping->host));
+}
+#else
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
return 0;
}
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return 0;
+}
+#endif /* CONFIG_DEBUG_FS */
+
+static const struct file_operations o2hb_debug_fops = {
+ .open = o2hb_debug_open,
+ .release = o2hb_debug_release,
+ .read = o2hb_debug_read,
+ .llseek = generic_file_llseek,
+};
+
+void o2hb_exit(void)
+{
+ kfree(o2hb_db_livenodes);
+ kfree(o2hb_db_liveregions);
+ kfree(o2hb_db_quorumregions);
+ kfree(o2hb_db_failedregions);
+ debugfs_remove(o2hb_debug_failedregions);
+ debugfs_remove(o2hb_debug_quorumregions);
+ debugfs_remove(o2hb_debug_liveregions);
+ debugfs_remove(o2hb_debug_livenodes);
+ debugfs_remove(o2hb_debug_dir);
+}
+
+static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
+ struct o2hb_debug_buf **db, int db_len,
+ int type, int size, int len, void *data)
+{
+ *db = kmalloc(db_len, GFP_KERNEL);
+ if (!*db)
+ return NULL;
+
+ (*db)->db_type = type;
+ (*db)->db_size = size;
+ (*db)->db_len = len;
+ (*db)->db_data = data;
+
+ return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
+ &o2hb_debug_fops);
+}
+
+static int o2hb_debug_init(void)
+{
+ int ret = -ENOMEM;
+
+ o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+ if (!o2hb_debug_dir) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+ o2hb_debug_dir,
+ &o2hb_db_livenodes,
+ sizeof(*o2hb_db_livenodes),
+ O2HB_DB_TYPE_LIVENODES,
+ sizeof(o2hb_live_node_bitmap),
+ O2NM_MAX_NODES,
+ o2hb_live_node_bitmap);
+ if (!o2hb_debug_livenodes) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
+ o2hb_debug_dir,
+ &o2hb_db_liveregions,
+ sizeof(*o2hb_db_liveregions),
+ O2HB_DB_TYPE_LIVEREGIONS,
+ sizeof(o2hb_live_region_bitmap),
+ O2NM_MAX_REGIONS,
+ o2hb_live_region_bitmap);
+ if (!o2hb_debug_liveregions) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ o2hb_debug_quorumregions =
+ o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
+ o2hb_debug_dir,
+ &o2hb_db_quorumregions,
+ sizeof(*o2hb_db_quorumregions),
+ O2HB_DB_TYPE_QUORUMREGIONS,
+ sizeof(o2hb_quorum_region_bitmap),
+ O2NM_MAX_REGIONS,
+ o2hb_quorum_region_bitmap);
+ if (!o2hb_debug_quorumregions) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ o2hb_debug_failedregions =
+ o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
+ o2hb_debug_dir,
+ &o2hb_db_failedregions,
+ sizeof(*o2hb_db_failedregions),
+ O2HB_DB_TYPE_FAILEDREGIONS,
+ sizeof(o2hb_failed_region_bitmap),
+ O2NM_MAX_REGIONS,
+ o2hb_failed_region_bitmap);
+ if (!o2hb_debug_failedregions) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ ret = 0;
+bail:
+ if (ret)
+ o2hb_exit();
+
+ return ret;
+}
-void o2hb_init(void)
+int o2hb_init(void)
{
int i;
@@ -1017,6 +1392,14 @@ void o2hb_init(void)
INIT_LIST_HEAD(&o2hb_node_events);
memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+ memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
+ memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
+ memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
+ memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
+
+ o2hb_dependent_users = 0;
+
+ return o2hb_debug_init();
}
/* if we're already in a callback then we're already serialized by the sem */
@@ -1062,8 +1445,9 @@ static void o2hb_region_release(struct config_item *item)
struct page *page;
struct o2hb_region *reg = to_o2hb_region(item);
- if (reg->hr_tmp_block)
- kfree(reg->hr_tmp_block);
+ mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
+
+ kfree(reg->hr_tmp_block);
if (reg->hr_slot_data) {
for (i = 0; i < reg->hr_num_pages; i++) {
@@ -1075,10 +1459,17 @@ static void o2hb_region_release(struct config_item *item)
}
if (reg->hr_bdev)
- blkdev_put(reg->hr_bdev);
+ blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
- if (reg->hr_slots)
- kfree(reg->hr_slots);
+ kfree(reg->hr_slots);
+
+ kfree(reg->hr_db_regnum);
+ kfree(reg->hr_db_livenodes);
+ debugfs_remove(reg->hr_debug_livenodes);
+ debugfs_remove(reg->hr_debug_regnum);
+ debugfs_remove(reg->hr_debug_elapsed_time);
+ debugfs_remove(reg->hr_debug_pinned);
+ debugfs_remove(reg->hr_debug_dir);
spin_lock(&o2hb_live_lock);
list_del(&reg->hr_all_item);
@@ -1296,8 +1687,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
struct o2hb_disk_slot *slot;
struct o2hb_disk_heartbeat_block *hb_block;
- mlog_entry_void();
-
ret = o2hb_read_slots(reg, reg->hr_blocks);
if (ret) {
mlog_errno(ret);
@@ -1319,7 +1708,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
}
out:
- mlog_exit(ret);
return ret;
}
@@ -1328,12 +1716,14 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
const char *page,
size_t count)
{
+ struct task_struct *hb_task;
long fd;
int sectsize;
char *p = (char *)page;
- struct file *filp = NULL;
- struct inode *inode = NULL;
+ struct fd f;
+ struct inode *inode;
ssize_t ret = -EINVAL;
+ int live_threshold;
if (reg->hr_bdev)
goto out;
@@ -1350,38 +1740,38 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
if (fd < 0 || fd >= INT_MAX)
goto out;
- filp = fget(fd);
- if (filp == NULL)
+ f = fdget(fd);
+ if (f.file == NULL)
goto out;
if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
reg->hr_block_bytes == 0)
- goto out;
+ goto out2;
- inode = igrab(filp->f_mapping->host);
+ inode = igrab(f.file->f_mapping->host);
if (inode == NULL)
- goto out;
+ goto out2;
if (!S_ISBLK(inode->i_mode))
- goto out;
+ goto out3;
- reg->hr_bdev = I_BDEV(filp->f_mapping->host);
- ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0);
+ reg->hr_bdev = I_BDEV(f.file->f_mapping->host);
+ ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
if (ret) {
reg->hr_bdev = NULL;
- goto out;
+ goto out3;
}
inode = NULL;
bdevname(reg->hr_bdev, reg->hr_dev_name);
- sectsize = bdev_hardsect_size(reg->hr_bdev);
+ sectsize = bdev_logical_block_size(reg->hr_bdev);
if (sectsize != reg->hr_block_bytes) {
mlog(ML_ERROR,
"blocksize %u incorrect for device, expected %d",
reg->hr_block_bytes, sectsize);
ret = -EINVAL;
- goto out;
+ goto out3;
}
o2hb_init_region_params(reg);
@@ -1395,56 +1785,107 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
ret = o2hb_map_slot_data(reg);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out3;
}
ret = o2hb_populate_slot_data(reg);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out3;
}
- INIT_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout, reg);
+ INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
/*
* A node is considered live after it has beat LIVE_THRESHOLD
* times. We're not steady until we've given them a chance
* _after_ our first read.
+ * The default threshold is bare minimum so as to limit the delay
+ * during mounts. For global heartbeat, the threshold doubled for the
+ * first region.
*/
- atomic_set(&reg->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1);
-
- reg->hr_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
- reg->hr_item.ci_name);
- if (IS_ERR(reg->hr_task)) {
- ret = PTR_ERR(reg->hr_task);
+ live_threshold = O2HB_LIVE_THRESHOLD;
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
+ live_threshold <<= 1;
+ spin_unlock(&o2hb_live_lock);
+ }
+ ++live_threshold;
+ atomic_set(&reg->hr_steady_iterations, live_threshold);
+ /* unsteady_iterations is double the steady_iterations */
+ atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
+
+ hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
+ reg->hr_item.ci_name);
+ if (IS_ERR(hb_task)) {
+ ret = PTR_ERR(hb_task);
mlog_errno(ret);
- reg->hr_task = NULL;
- goto out;
+ goto out3;
}
+ spin_lock(&o2hb_live_lock);
+ reg->hr_task = hb_task;
+ spin_unlock(&o2hb_live_lock);
+
ret = wait_event_interruptible(o2hb_steady_queue,
atomic_read(&reg->hr_steady_iterations) == 0);
if (ret) {
- kthread_stop(reg->hr_task);
- reg->hr_task = NULL;
- goto out;
+ atomic_set(&reg->hr_steady_iterations, 0);
+ reg->hr_aborted_start = 1;
+ }
+
+ if (reg->hr_aborted_start) {
+ ret = -EIO;
+ goto out3;
}
- ret = count;
+ /* Ok, we were woken. Make sure it wasn't by drop_item() */
+ spin_lock(&o2hb_live_lock);
+ hb_task = reg->hr_task;
+ if (o2hb_global_heartbeat_active())
+ set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
+
+ if (hb_task)
+ ret = count;
+ else
+ ret = -EIO;
+
+ if (hb_task && o2hb_global_heartbeat_active())
+ printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+
+out3:
+ iput(inode);
+out2:
+ fdput(f);
out:
- if (filp)
- fput(filp);
- if (inode)
- iput(inode);
if (ret < 0) {
if (reg->hr_bdev) {
- blkdev_put(reg->hr_bdev);
+ blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
reg->hr_bdev = NULL;
}
}
return ret;
}
+static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
+ char *page)
+{
+ pid_t pid = 0;
+
+ spin_lock(&o2hb_live_lock);
+ if (reg->hr_task)
+ pid = task_pid_nr(reg->hr_task);
+ spin_unlock(&o2hb_live_lock);
+
+ if (!pid)
+ return 0;
+
+ return sprintf(page, "%u\n", pid);
+}
+
struct o2hb_region_attribute {
struct configfs_attribute attr;
ssize_t (*show)(struct o2hb_region *, char *);
@@ -1483,11 +1924,19 @@ static struct o2hb_region_attribute o2hb_region_attr_dev = {
.store = o2hb_region_dev_write,
};
+static struct o2hb_region_attribute o2hb_region_attr_pid = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "pid",
+ .ca_mode = S_IRUGO | S_IRUSR },
+ .show = o2hb_region_pid_read,
+};
+
static struct configfs_attribute *o2hb_region_attrs[] = {
&o2hb_region_attr_block_bytes.attr,
&o2hb_region_attr_start_block.attr,
&o2hb_region_attr_blocks.attr,
&o2hb_region_attr_dev.attr,
+ &o2hb_region_attr_pid.attr,
NULL,
};
@@ -1545,42 +1994,176 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
: NULL;
}
+static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+{
+ int ret = -ENOMEM;
+
+ reg->hr_debug_dir =
+ debugfs_create_dir(config_item_name(&reg->hr_item), dir);
+ if (!reg->hr_debug_dir) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ reg->hr_debug_livenodes =
+ o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+ reg->hr_debug_dir,
+ &(reg->hr_db_livenodes),
+ sizeof(*(reg->hr_db_livenodes)),
+ O2HB_DB_TYPE_REGION_LIVENODES,
+ sizeof(reg->hr_live_node_bitmap),
+ O2NM_MAX_NODES, reg);
+ if (!reg->hr_debug_livenodes) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ reg->hr_debug_regnum =
+ o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
+ reg->hr_debug_dir,
+ &(reg->hr_db_regnum),
+ sizeof(*(reg->hr_db_regnum)),
+ O2HB_DB_TYPE_REGION_NUMBER,
+ 0, O2NM_MAX_NODES, reg);
+ if (!reg->hr_debug_regnum) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ reg->hr_debug_elapsed_time =
+ o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
+ reg->hr_debug_dir,
+ &(reg->hr_db_elapsed_time),
+ sizeof(*(reg->hr_db_elapsed_time)),
+ O2HB_DB_TYPE_REGION_ELAPSED_TIME,
+ 0, 0, reg);
+ if (!reg->hr_debug_elapsed_time) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ reg->hr_debug_pinned =
+ o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
+ reg->hr_debug_dir,
+ &(reg->hr_db_pinned),
+ sizeof(*(reg->hr_db_pinned)),
+ O2HB_DB_TYPE_REGION_PINNED,
+ 0, 0, reg);
+ if (!reg->hr_debug_pinned) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ ret = 0;
+bail:
+ return ret;
+}
+
static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
const char *name)
{
struct o2hb_region *reg = NULL;
- struct config_item *ret = NULL;
+ int ret;
- reg = kcalloc(1, sizeof(struct o2hb_region), GFP_KERNEL);
+ reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
if (reg == NULL)
- goto out; /* ENOMEM */
+ return ERR_PTR(-ENOMEM);
- config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
-
- ret = &reg->hr_item;
+ if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) {
+ ret = -ENAMETOOLONG;
+ goto free;
+ }
spin_lock(&o2hb_live_lock);
+ reg->hr_region_num = 0;
+ if (o2hb_global_heartbeat_active()) {
+ reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
+ O2NM_MAX_REGIONS);
+ if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
+ spin_unlock(&o2hb_live_lock);
+ ret = -EFBIG;
+ goto free;
+ }
+ set_bit(reg->hr_region_num, o2hb_region_bitmap);
+ }
list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
spin_unlock(&o2hb_live_lock);
-out:
- if (ret == NULL)
- kfree(reg);
- return ret;
+ config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+
+ ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
+ if (ret) {
+ config_item_put(&reg->hr_item);
+ goto free;
+ }
+
+ return &reg->hr_item;
+free:
+ kfree(reg);
+ return ERR_PTR(ret);
}
static void o2hb_heartbeat_group_drop_item(struct config_group *group,
struct config_item *item)
{
+ struct task_struct *hb_task;
struct o2hb_region *reg = to_o2hb_region(item);
+ int quorum_region = 0;
/* stop the thread when the user removes the region dir */
- if (reg->hr_task) {
- kthread_stop(reg->hr_task);
- reg->hr_task = NULL;
+ spin_lock(&o2hb_live_lock);
+ hb_task = reg->hr_task;
+ reg->hr_task = NULL;
+ reg->hr_item_dropped = 1;
+ spin_unlock(&o2hb_live_lock);
+
+ if (hb_task)
+ kthread_stop(hb_task);
+
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+ clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ quorum_region = 1;
+ clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
+ ((atomic_read(&reg->hr_steady_iterations) == 0) ?
+ "stopped" : "start aborted"), config_item_name(item),
+ reg->hr_dev_name);
+ }
+
+ /*
+ * If we're racing a dev_write(), we need to wake them. They will
+ * check reg->hr_task
+ */
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ reg->hr_aborted_start = 1;
+ atomic_set(&reg->hr_steady_iterations, 0);
+ wake_up(&o2hb_steady_queue);
}
config_item_put(item);
+
+ if (!o2hb_global_heartbeat_active() || !quorum_region)
+ return;
+
+ /*
+ * If global heartbeat active and there are dependent users,
+ * pin all regions if quorum region count <= CUT_OFF
+ */
+ spin_lock(&o2hb_live_lock);
+
+ if (!o2hb_dependent_users)
+ goto unlock;
+
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+ o2hb_region_pin(NULL);
+
+unlock:
+ spin_unlock(&o2hb_live_lock);
}
struct o2hb_heartbeat_group_attribute {
@@ -1640,6 +2223,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
return count;
}
+static
+ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
+ char *page)
+{
+ return sprintf(page, "%s\n",
+ o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
+}
+
+static
+ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
+ const char *page, size_t count)
+{
+ unsigned int i;
+ int ret;
+ size_t len;
+
+ len = (page[count - 1] == '\n') ? count - 1 : count;
+ if (!len)
+ return -EINVAL;
+
+ for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
+ if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
+ continue;
+
+ ret = o2hb_global_heartbeat_mode_set(i);
+ if (!ret)
+ printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
+ o2hb_heartbeat_mode_desc[i]);
+ return count;
+ }
+
+ return -EINVAL;
+
+}
+
static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "dead_threshold",
@@ -1648,12 +2266,21 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold
.store = o2hb_heartbeat_group_threshold_store,
};
+static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "mode",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = o2hb_heartbeat_group_mode_show,
+ .store = o2hb_heartbeat_group_mode_store,
+};
+
static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
&o2hb_heartbeat_group_attr_threshold.attr,
+ &o2hb_heartbeat_group_attr_mode.attr,
NULL,
};
-static struct configfs_item_operations o2hb_hearbeat_group_item_ops = {
+static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
.show_attribute = o2hb_heartbeat_group_show,
.store_attribute = o2hb_heartbeat_group_store,
};
@@ -1665,7 +2292,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
static struct config_item_type o2hb_heartbeat_group_type = {
.ct_group_ops = &o2hb_heartbeat_group_group_ops,
- .ct_item_ops = &o2hb_hearbeat_group_item_ops,
+ .ct_item_ops = &o2hb_heartbeat_group_item_ops,
.ct_attrs = o2hb_heartbeat_group_attrs,
.ct_owner = THIS_MODULE,
};
@@ -1677,7 +2304,7 @@ struct config_group *o2hb_alloc_hb_set(void)
struct o2hb_heartbeat_group *hs = NULL;
struct config_group *ret = NULL;
- hs = kcalloc(1, sizeof(struct o2hb_heartbeat_group), GFP_KERNEL);
+ hs = kzalloc(sizeof(struct o2hb_heartbeat_group), GFP_KERNEL);
if (hs == NULL)
goto out;
@@ -1697,7 +2324,7 @@ void o2hb_free_hb_set(struct config_group *group)
kfree(hs);
}
-/* hb callback registration and issueing */
+/* hb callback registration and issuing */
static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type)
{
@@ -1722,10 +2349,150 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
}
EXPORT_SYMBOL_GPL(o2hb_setup_callback);
-int o2hb_register_callback(struct o2hb_callback_func *hc)
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only pin the matching region. In global we pin all the active
+ * regions.
+ */
+static int o2hb_region_pin(const char *region_uuid)
+{
+ int ret = 0, found = 0;
+ struct o2hb_region *reg;
+ char *uuid;
+
+ assert_spin_locked(&o2hb_live_lock);
+
+ list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
+
+ uuid = config_item_name(&reg->hr_item);
+
+ /* local heartbeat */
+ if (region_uuid) {
+ if (strcmp(region_uuid, uuid))
+ continue;
+ found = 1;
+ }
+
+ if (reg->hr_item_pinned || reg->hr_item_dropped)
+ goto skip_pin;
+
+ /* Ignore ENOENT only for local hb (userdlm domain) */
+ ret = o2nm_depend_item(&reg->hr_item);
+ if (!ret) {
+ mlog(ML_CLUSTER, "Pin region %s\n", uuid);
+ reg->hr_item_pinned = 1;
+ } else {
+ if (ret == -ENOENT && found)
+ ret = 0;
+ else {
+ mlog(ML_ERROR, "Pin region %s fails with %d\n",
+ uuid, ret);
+ break;
+ }
+ }
+skip_pin:
+ if (found)
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only unpin the matching region. In global we unpin all the
+ * active regions.
+ */
+static void o2hb_region_unpin(const char *region_uuid)
+{
+ struct o2hb_region *reg;
+ char *uuid;
+ int found = 0;
+
+ assert_spin_locked(&o2hb_live_lock);
+
+ list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
+
+ uuid = config_item_name(&reg->hr_item);
+ if (region_uuid) {
+ if (strcmp(region_uuid, uuid))
+ continue;
+ found = 1;
+ }
+
+ if (reg->hr_item_pinned) {
+ mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
+ o2nm_undepend_item(&reg->hr_item);
+ reg->hr_item_pinned = 0;
+ }
+ if (found)
+ break;
+ }
+}
+
+static int o2hb_region_inc_user(const char *region_uuid)
+{
+ int ret = 0;
+
+ spin_lock(&o2hb_live_lock);
+
+ /* local heartbeat */
+ if (!o2hb_global_heartbeat_active()) {
+ ret = o2hb_region_pin(region_uuid);
+ goto unlock;
+ }
+
+ /*
+ * if global heartbeat active and this is the first dependent user,
+ * pin all regions if quorum region count <= CUT_OFF
+ */
+ o2hb_dependent_users++;
+ if (o2hb_dependent_users > 1)
+ goto unlock;
+
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+ ret = o2hb_region_pin(NULL);
+
+unlock:
+ spin_unlock(&o2hb_live_lock);
+ return ret;
+}
+
+void o2hb_region_dec_user(const char *region_uuid)
{
- struct o2hb_callback_func *tmp;
- struct list_head *iter;
+ spin_lock(&o2hb_live_lock);
+
+ /* local heartbeat */
+ if (!o2hb_global_heartbeat_active()) {
+ o2hb_region_unpin(region_uuid);
+ goto unlock;
+ }
+
+ /*
+ * if global heartbeat active and there are no dependent users,
+ * unpin all quorum regions
+ */
+ o2hb_dependent_users--;
+ if (!o2hb_dependent_users)
+ o2hb_region_unpin(NULL);
+
+unlock:
+ spin_unlock(&o2hb_live_lock);
+}
+
+int o2hb_register_callback(const char *region_uuid,
+ struct o2hb_callback_func *hc)
+{
+ struct o2hb_callback_func *f;
struct o2hb_callback *hbcall;
int ret;
@@ -1738,12 +2505,19 @@ int o2hb_register_callback(struct o2hb_callback_func *hc)
goto out;
}
+ if (region_uuid) {
+ ret = o2hb_region_inc_user(region_uuid);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
down_write(&o2hb_callback_sem);
- list_for_each(iter, &hbcall->list) {
- tmp = list_entry(iter, struct o2hb_callback_func, hc_item);
- if (hc->hc_priority < tmp->hc_priority) {
- list_add_tail(&hc->hc_item, iter);
+ list_for_each_entry(f, &hbcall->list, hc_item) {
+ if (hc->hc_priority < f->hc_priority) {
+ list_add_tail(&hc->hc_item, &f->hc_item);
break;
}
}
@@ -1753,29 +2527,32 @@ int o2hb_register_callback(struct o2hb_callback_func *hc)
up_write(&o2hb_callback_sem);
ret = 0;
out:
- mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
+ mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
ret, __builtin_return_address(0), hc);
return ret;
}
EXPORT_SYMBOL_GPL(o2hb_register_callback);
-int o2hb_unregister_callback(struct o2hb_callback_func *hc)
+void o2hb_unregister_callback(const char *region_uuid,
+ struct o2hb_callback_func *hc)
{
BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
- mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
+ mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
__builtin_return_address(0), hc);
+ /* XXX Can this happen _with_ a region reference? */
if (list_empty(&hc->hc_item))
- return 0;
+ return;
+
+ if (region_uuid)
+ o2hb_region_dec_user(region_uuid);
down_write(&o2hb_callback_sem);
list_del_init(&hc->hc_item);
up_write(&o2hb_callback_sem);
-
- return 0;
}
EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
@@ -1846,3 +2623,37 @@ void o2hb_stop_all_regions(void)
spin_unlock(&o2hb_live_lock);
}
EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
+
+int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
+{
+ struct o2hb_region *reg;
+ int numregs = 0;
+ char *p;
+
+ spin_lock(&o2hb_live_lock);
+
+ p = region_uuids;
+ list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
+
+ mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
+ if (numregs < max_regions) {
+ memcpy(p, config_item_name(&reg->hr_item),
+ O2HB_MAX_REGION_NAME_LEN);
+ p += O2HB_MAX_REGION_NAME_LEN;
+ }
+ numregs++;
+ }
+
+ spin_unlock(&o2hb_live_lock);
+
+ return numregs;
+}
+EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
+
+int o2hb_global_heartbeat_active(void)
+{
+ return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL);
+}
+EXPORT_SYMBOL(o2hb_global_heartbeat_active);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index cac6223206a..00ad8e8fea5 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -31,11 +31,13 @@
#define O2HB_REGION_TIMEOUT_MS 2000
+#define O2HB_MAX_REGION_NAME_LEN 32
+
/* number of changes to be seen as live */
#define O2HB_LIVE_THRESHOLD 2
/* number of equal samples to be seen as dead */
extern unsigned int o2hb_dead_threshold;
-#define O2HB_DEFAULT_DEAD_THRESHOLD 7
+#define O2HB_DEFAULT_DEAD_THRESHOLD 31
/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
#define O2HB_MIN_DEAD_THRESHOLD 2
#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
@@ -69,14 +71,19 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
o2hb_cb_func *func,
void *data,
int priority);
-int o2hb_register_callback(struct o2hb_callback_func *hc);
-int o2hb_unregister_callback(struct o2hb_callback_func *hc);
+int o2hb_register_callback(const char *region_uuid,
+ struct o2hb_callback_func *hc);
+void o2hb_unregister_callback(const char *region_uuid,
+ struct o2hb_callback_func *hc);
void o2hb_fill_node_map(unsigned long *map,
unsigned bytes);
-void o2hb_init(void);
+void o2hb_exit(void);
+int o2hb_init(void);
int o2hb_check_node_heartbeating(u8 node_num);
int o2hb_check_node_heartbeating_from_callback(u8 node_num);
int o2hb_check_local_node_heartbeating(void);
void o2hb_stop_all_regions(void);
+int o2hb_get_all_regions(char *region_uuids, u8 numregions);
+int o2hb_global_heartbeat_active(void);
#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 636593bf4d1..07ac24fd925 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -30,7 +30,7 @@
struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
EXPORT_SYMBOL_GPL(mlog_and_bits);
-struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK);
+struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0);
EXPORT_SYMBOL_GPL(mlog_not_bits);
static ssize_t mlog_mask_show(u64 mask, char *buf)
@@ -74,15 +74,12 @@ struct mlog_attribute {
#define define_mask(_name) { \
.attr = { \
.name = #_name, \
- .owner = THIS_MODULE, \
.mode = S_IRUGO | S_IWUSR, \
}, \
.mask = ML_##_name, \
}
static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
- define_mask(ENTRY),
- define_mask(EXIT),
define_mask(TCP),
define_mask(MSG),
define_mask(SOCKET),
@@ -94,22 +91,12 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(DLM_THREAD),
define_mask(DLM_MASTER),
define_mask(DLM_RECOVERY),
- define_mask(AIO),
- define_mask(JOURNAL),
- define_mask(DISK_ALLOC),
- define_mask(SUPER),
- define_mask(FILE_IO),
- define_mask(EXTENT_MAP),
define_mask(DLM_GLUE),
- define_mask(BH_IO),
- define_mask(UPTODATE),
- define_mask(NAMEI),
- define_mask(INODE),
define_mask(VOTE),
- define_mask(DCACHE),
define_mask(CONN),
define_mask(QUORUM),
- define_mask(EXPORT),
+ define_mask(BASTS),
+ define_mask(CLUSTER),
define_mask(ERROR),
define_mask(NOTICE),
define_mask(KTHREAD),
@@ -133,7 +120,7 @@ static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
return mlog_mask_store(mlog_attr->mask, buf, count);
}
-static struct sysfs_ops mlog_attr_ops = {
+static const struct sysfs_ops mlog_attr_ops = {
.show = mlog_show,
.store = mlog_store,
};
@@ -144,10 +131,10 @@ static struct kobj_type mlog_ktype = {
};
static struct kset mlog_kset = {
- .kobj = {.name = "logmask", .ktype = &mlog_ktype},
+ .kobj = {.ktype = &mlog_ktype},
};
-int mlog_sys_init(struct subsystem *o2cb_subsys)
+int mlog_sys_init(struct kset *o2cb_kset)
{
int i = 0;
@@ -157,7 +144,8 @@ int mlog_sys_init(struct subsystem *o2cb_subsys)
}
mlog_attr_ptrs[i] = NULL;
- mlog_kset.subsys = o2cb_subsys;
+ kobject_set_name(&mlog_kset.kobj, "logmask");
+ mlog_kset.kobj.kset = o2cb_kset;
return kset_register(&mlog_kset);
}
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index a42628ba9dd..2260fb9e650 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -48,77 +48,64 @@
* only emit the appropriage printk() when the caller passes in a constant
* mask, as is almost always the case.
*
- * All this bitmask nonsense is hidden from the /proc interface so that Joel
- * doesn't have an aneurism. Reading the file gives a straight forward
- * indication of which bits are on or off:
- * ENTRY off
- * EXIT off
+ * All this bitmask nonsense is managed from the files under
+ * /sys/fs/o2cb/logmask/. Reading the files gives a straightforward
+ * indication of which bits are allowed (allow) or denied (off/deny).
+ * ENTRY deny
+ * EXIT deny
* TCP off
* MSG off
* SOCKET off
- * ERROR off
- * NOTICE on
+ * ERROR allow
+ * NOTICE allow
*
* Writing changes the state of a given bit and requires a strictly formatted
* single write() call:
*
- * write(fd, "ENTRY on", 8);
+ * write(fd, "allow", 5);
*
- * would turn the entry bit on. "1" is also accepted in the place of "on", and
- * "off" and "0" behave as expected.
+ * Echoing allow/deny/off string into the logmask files can flip the bits
+ * on or off as expected; here is the bash script for example:
*
- * Some trivial shell can flip all the bits on or off:
+ * log_mask="/sys/fs/o2cb/log_mask"
+ * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do
+ * echo allow >"$log_mask"/"$node"
+ * done
*
- * log_mask="/proc/fs/ocfs2_nodemanager/log_mask"
- * cat $log_mask | (
- * while read bit status; do
- * # $1 is "on" or "off", say
- * echo "$bit $1" > $log_mask
- * done
- * )
+ * The debugfs.ocfs2 tool can also flip the bits with the -l option:
+ *
+ * debugfs.ocfs2 -l TCP allow
*/
/* for task_struct */
#include <linux/sched.h>
/* bits that are frequently given and infrequently matched in the low word */
-/* NOTE: If you add a flag, you need to also update mlog.c! */
-#define ML_ENTRY 0x0000000000000001ULL /* func call entry */
-#define ML_EXIT 0x0000000000000002ULL /* func call exit */
-#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */
-#define ML_MSG 0x0000000000000008ULL /* net network messages */
-#define ML_SOCKET 0x0000000000000010ULL /* net socket lifetime */
-#define ML_HEARTBEAT 0x0000000000000020ULL /* hb all heartbeat tracking */
-#define ML_HB_BIO 0x0000000000000040ULL /* hb io tracing */
-#define ML_DLMFS 0x0000000000000080ULL /* dlm user dlmfs */
-#define ML_DLM 0x0000000000000100ULL /* dlm general debugging */
-#define ML_DLM_DOMAIN 0x0000000000000200ULL /* dlm domain debugging */
-#define ML_DLM_THREAD 0x0000000000000400ULL /* dlm domain thread */
-#define ML_DLM_MASTER 0x0000000000000800ULL /* dlm master functions */
-#define ML_DLM_RECOVERY 0x0000000000001000ULL /* dlm master functions */
-#define ML_AIO 0x0000000000002000ULL /* ocfs2 aio read and write */
-#define ML_JOURNAL 0x0000000000004000ULL /* ocfs2 journalling functions */
-#define ML_DISK_ALLOC 0x0000000000008000ULL /* ocfs2 disk allocation */
-#define ML_SUPER 0x0000000000010000ULL /* ocfs2 mount / umount */
-#define ML_FILE_IO 0x0000000000020000ULL /* ocfs2 file I/O */
-#define ML_EXTENT_MAP 0x0000000000040000ULL /* ocfs2 extent map caching */
-#define ML_DLM_GLUE 0x0000000000080000ULL /* ocfs2 dlm glue layer */
-#define ML_BH_IO 0x0000000000100000ULL /* ocfs2 buffer I/O */
-#define ML_UPTODATE 0x0000000000200000ULL /* ocfs2 caching sequence #'s */
-#define ML_NAMEI 0x0000000000400000ULL /* ocfs2 directory / namespace */
-#define ML_INODE 0x0000000000800000ULL /* ocfs2 inode manipulation */
-#define ML_VOTE 0x0000000001000000ULL /* ocfs2 node messaging */
-#define ML_DCACHE 0x0000000002000000ULL /* ocfs2 dcache operations */
-#define ML_CONN 0x0000000004000000ULL /* net connection management */
-#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
-#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
+/* NOTE: If you add a flag, you need to also update masklog.c! */
+#define ML_TCP 0x0000000000000001ULL /* net cluster/tcp.c */
+#define ML_MSG 0x0000000000000002ULL /* net network messages */
+#define ML_SOCKET 0x0000000000000004ULL /* net socket lifetime */
+#define ML_HEARTBEAT 0x0000000000000008ULL /* hb all heartbeat tracking */
+#define ML_HB_BIO 0x0000000000000010ULL /* hb io tracing */
+#define ML_DLMFS 0x0000000000000020ULL /* dlm user dlmfs */
+#define ML_DLM 0x0000000000000040ULL /* dlm general debugging */
+#define ML_DLM_DOMAIN 0x0000000000000080ULL /* dlm domain debugging */
+#define ML_DLM_THREAD 0x0000000000000100ULL /* dlm domain thread */
+#define ML_DLM_MASTER 0x0000000000000200ULL /* dlm master functions */
+#define ML_DLM_RECOVERY 0x0000000000000400ULL /* dlm master functions */
+#define ML_DLM_GLUE 0x0000000000000800ULL /* ocfs2 dlm glue layer */
+#define ML_VOTE 0x0000000000001000ULL /* ocfs2 node messaging */
+#define ML_CONN 0x0000000000002000ULL /* net connection management */
+#define ML_QUORUM 0x0000000000004000ULL /* net connection quorum */
+#define ML_BASTS 0x0000000000008000ULL /* dlmglue asts and basts */
+#define ML_CLUSTER 0x0000000000010000ULL /* cluster stack */
+
/* bits that are infrequently given and frequently matched in the high word */
-#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
-#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
-#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
+#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */
+#define ML_NOTICE 0x2000000000000000ULL /* setn to KERN_NOTICE */
+#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */
#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
-#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
#ifndef MLOG_MASK_PREFIX
#define MLOG_MASK_PREFIX 0
#endif
@@ -192,9 +179,9 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
* previous token if args expands to nothing.
*/
#define __mlog_printk(level, fmt, args...) \
- printk(level "(%u,%lu):%s:%d " fmt, current->pid, \
- __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \
- ##args)
+ printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
+ task_pid_nr(current), __mlog_cpu_guess, \
+ __PRETTY_FUNCTION__, __LINE__ , ##args)
#define mlog(mask, fmt, args...) do { \
u64 __m = MLOG_MASK_PREFIX | (mask); \
@@ -212,62 +199,11 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
#define mlog_errno(st) do { \
int _st = (st); \
if (_st != -ERESTARTSYS && _st != -EINTR && \
- _st != AOP_TRUNCATED_PAGE) \
+ _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \
+ _st != -EDQUOT) \
mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
} while (0)
-#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
-#define mlog_entry(fmt, args...) do { \
- mlog(ML_ENTRY, "ENTRY:" fmt , ##args); \
-} while (0)
-
-#define mlog_entry_void() do { \
- mlog(ML_ENTRY, "ENTRY:\n"); \
-} while (0)
-
-/*
- * We disable this for sparse.
- */
-#if !defined(__CHECKER__)
-#define mlog_exit(st) do { \
- if (__builtin_types_compatible_p(typeof(st), unsigned long)) \
- mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st)); \
- else if (__builtin_types_compatible_p(typeof(st), signed long)) \
- mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st)); \
- else if (__builtin_types_compatible_p(typeof(st), unsigned int) \
- || __builtin_types_compatible_p(typeof(st), unsigned short) \
- || __builtin_types_compatible_p(typeof(st), unsigned char)) \
- mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st)); \
- else if (__builtin_types_compatible_p(typeof(st), signed int) \
- || __builtin_types_compatible_p(typeof(st), signed short) \
- || __builtin_types_compatible_p(typeof(st), signed char)) \
- mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st)); \
- else if (__builtin_types_compatible_p(typeof(st), long long)) \
- mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
- else \
- mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st)); \
-} while (0)
-#else
-#define mlog_exit(st) do { \
- mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
-} while (0)
-#endif
-
-#define mlog_exit_ptr(ptr) do { \
- mlog(ML_EXIT, "EXIT: %p\n", ptr); \
-} while (0)
-
-#define mlog_exit_void() do { \
- mlog(ML_EXIT, "EXIT\n"); \
-} while (0)
-#else
-#define mlog_entry(...) do { } while (0)
-#define mlog_entry_void(...) do { } while (0)
-#define mlog_exit(...) do { } while (0)
-#define mlog_exit_ptr(...) do { } while (0)
-#define mlog_exit_void(...) do { } while (0)
-#endif /* defined(CONFIG_OCFS2_DEBUG_MASKLOG) */
-
#define mlog_bug_on_msg(cond, fmt, args...) do { \
if (cond) { \
mlog(ML_ERROR, "bug expression: " #cond "\n"); \
@@ -278,7 +214,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
#include <linux/kobject.h>
#include <linux/sysfs.h>
-int mlog_sys_init(struct subsystem *o2cb_subsys);
+int mlog_sys_init(struct kset *o2cb_subsys);
void mlog_sys_shutdown(void);
#endif /* O2CLUSTER_MASKLOG_H */
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
new file mode 100644
index 00000000000..73ba81928bc
--- /dev/null
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -0,0 +1,579 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * netdebug.c
+ *
+ * debug functionality for o2net
+ *
+ * Copyright (C) 2005, 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+
+#include <linux/uaccess.h>
+
+#include "tcp.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_TCP
+#include "masklog.h"
+
+#include "tcp_internal.h"
+
+#define O2NET_DEBUG_DIR "o2net"
+#define SC_DEBUG_NAME "sock_containers"
+#define NST_DEBUG_NAME "send_tracking"
+#define STATS_DEBUG_NAME "stats"
+#define NODES_DEBUG_NAME "connected_nodes"
+
+#define SHOW_SOCK_CONTAINERS 0
+#define SHOW_SOCK_STATS 1
+
+static struct dentry *o2net_dentry;
+static struct dentry *sc_dentry;
+static struct dentry *nst_dentry;
+static struct dentry *stats_dentry;
+static struct dentry *nodes_dentry;
+
+static DEFINE_SPINLOCK(o2net_debug_lock);
+
+static LIST_HEAD(sock_containers);
+static LIST_HEAD(send_tracking);
+
+void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+{
+ spin_lock(&o2net_debug_lock);
+ list_add(&nst->st_net_debug_item, &send_tracking);
+ spin_unlock(&o2net_debug_lock);
+}
+
+void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+{
+ spin_lock(&o2net_debug_lock);
+ if (!list_empty(&nst->st_net_debug_item))
+ list_del_init(&nst->st_net_debug_item);
+ spin_unlock(&o2net_debug_lock);
+}
+
+static struct o2net_send_tracking
+ *next_nst(struct o2net_send_tracking *nst_start)
+{
+ struct o2net_send_tracking *nst, *ret = NULL;
+
+ assert_spin_locked(&o2net_debug_lock);
+
+ list_for_each_entry(nst, &nst_start->st_net_debug_item,
+ st_net_debug_item) {
+ /* discover the head of the list */
+ if (&nst->st_net_debug_item == &send_tracking)
+ break;
+
+ /* use st_task to detect real nsts in the list */
+ if (nst->st_task != NULL) {
+ ret = nst;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static void *nst_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+
+ spin_lock(&o2net_debug_lock);
+ nst = next_nst(dummy_nst);
+ spin_unlock(&o2net_debug_lock);
+
+ return nst;
+}
+
+static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+
+ spin_lock(&o2net_debug_lock);
+ nst = next_nst(dummy_nst);
+ list_del_init(&dummy_nst->st_net_debug_item);
+ if (nst)
+ list_add(&dummy_nst->st_net_debug_item,
+ &nst->st_net_debug_item);
+ spin_unlock(&o2net_debug_lock);
+
+ return nst; /* unused, just needs to be null when done */
+}
+
+static int nst_seq_show(struct seq_file *seq, void *v)
+{
+ struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+ ktime_t now;
+ s64 sock, send, status;
+
+ spin_lock(&o2net_debug_lock);
+ nst = next_nst(dummy_nst);
+ if (!nst)
+ goto out;
+
+ now = ktime_get();
+ sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
+ send = ktime_to_us(ktime_sub(now, nst->st_send_time));
+ status = ktime_to_us(ktime_sub(now, nst->st_status_time));
+
+ /* get_task_comm isn't exported. oh well. */
+ seq_printf(seq, "%p:\n"
+ " pid: %lu\n"
+ " tgid: %lu\n"
+ " process name: %s\n"
+ " node: %u\n"
+ " sc: %p\n"
+ " message id: %d\n"
+ " message type: %u\n"
+ " message key: 0x%08x\n"
+ " sock acquiry: %lld usecs ago\n"
+ " send start: %lld usecs ago\n"
+ " wait start: %lld usecs ago\n",
+ nst, (unsigned long)task_pid_nr(nst->st_task),
+ (unsigned long)nst->st_task->tgid,
+ nst->st_task->comm, nst->st_node,
+ nst->st_sc, nst->st_id, nst->st_msg_type,
+ nst->st_msg_key,
+ (long long)sock,
+ (long long)send,
+ (long long)status);
+
+out:
+ spin_unlock(&o2net_debug_lock);
+
+ return 0;
+}
+
+static void nst_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations nst_seq_ops = {
+ .start = nst_seq_start,
+ .next = nst_seq_next,
+ .stop = nst_seq_stop,
+ .show = nst_seq_show,
+};
+
+static int nst_fop_open(struct inode *inode, struct file *file)
+{
+ struct o2net_send_tracking *dummy_nst;
+ struct seq_file *seq;
+ int ret;
+
+ dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
+ if (dummy_nst == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dummy_nst->st_task = NULL;
+
+ ret = seq_open(file, &nst_seq_ops);
+ if (ret)
+ goto out;
+
+ seq = file->private_data;
+ seq->private = dummy_nst;
+ o2net_debug_add_nst(dummy_nst);
+
+ dummy_nst = NULL;
+
+out:
+ kfree(dummy_nst);
+ return ret;
+}
+
+static int nst_fop_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct o2net_send_tracking *dummy_nst = seq->private;
+
+ o2net_debug_del_nst(dummy_nst);
+ return seq_release_private(inode, file);
+}
+
+static const struct file_operations nst_seq_fops = {
+ .open = nst_fop_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = nst_fop_release,
+};
+
+void o2net_debug_add_sc(struct o2net_sock_container *sc)
+{
+ spin_lock(&o2net_debug_lock);
+ list_add(&sc->sc_net_debug_item, &sock_containers);
+ spin_unlock(&o2net_debug_lock);
+}
+
+void o2net_debug_del_sc(struct o2net_sock_container *sc)
+{
+ spin_lock(&o2net_debug_lock);
+ list_del_init(&sc->sc_net_debug_item);
+ spin_unlock(&o2net_debug_lock);
+}
+
+struct o2net_sock_debug {
+ int dbg_ctxt;
+ struct o2net_sock_container *dbg_sock;
+};
+
+static struct o2net_sock_container
+ *next_sc(struct o2net_sock_container *sc_start)
+{
+ struct o2net_sock_container *sc, *ret = NULL;
+
+ assert_spin_locked(&o2net_debug_lock);
+
+ list_for_each_entry(sc, &sc_start->sc_net_debug_item,
+ sc_net_debug_item) {
+ /* discover the head of the list miscast as a sc */
+ if (&sc->sc_net_debug_item == &sock_containers)
+ break;
+
+ /* use sc_page to detect real scs in the list */
+ if (sc->sc_page != NULL) {
+ ret = sc;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
+
+ spin_lock(&o2net_debug_lock);
+ sc = next_sc(dummy_sc);
+ spin_unlock(&o2net_debug_lock);
+
+ return sc;
+}
+
+static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
+
+ spin_lock(&o2net_debug_lock);
+ sc = next_sc(dummy_sc);
+ list_del_init(&dummy_sc->sc_net_debug_item);
+ if (sc)
+ list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item);
+ spin_unlock(&o2net_debug_lock);
+
+ return sc; /* unused, just needs to be null when done */
+}
+
+#ifdef CONFIG_OCFS2_FS_STATS
+# define sc_send_count(_s) ((_s)->sc_send_count)
+# define sc_recv_count(_s) ((_s)->sc_recv_count)
+# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total))
+# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total))
+# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total))
+# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total))
+#else
+# define sc_send_count(_s) (0U)
+# define sc_recv_count(_s) (0U)
+# define sc_tv_acquiry_total_ns(_s) (0LL)
+# define sc_tv_send_total_ns(_s) (0LL)
+# define sc_tv_status_total_ns(_s) (0LL)
+# define sc_tv_process_total_ns(_s) (0LL)
+#endif
+
+/* So that debugfs.ocfs2 can determine which format is being used */
+#define O2NET_STATS_STR_VERSION 1
+static void sc_show_sock_stats(struct seq_file *seq,
+ struct o2net_sock_container *sc)
+{
+ if (!sc)
+ return;
+
+ seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
+ sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
+ (long long)sc_tv_acquiry_total_ns(sc),
+ (long long)sc_tv_send_total_ns(sc),
+ (long long)sc_tv_status_total_ns(sc),
+ (unsigned long)sc_recv_count(sc),
+ (long long)sc_tv_process_total_ns(sc));
+}
+
+static void sc_show_sock_container(struct seq_file *seq,
+ struct o2net_sock_container *sc)
+{
+ struct inet_sock *inet = NULL;
+ __be32 saddr = 0, daddr = 0;
+ __be16 sport = 0, dport = 0;
+
+ if (!sc)
+ return;
+
+ if (sc->sc_sock) {
+ inet = inet_sk(sc->sc_sock->sk);
+ /* the stack's structs aren't sparse endian clean */
+ saddr = (__force __be32)inet->inet_saddr;
+ daddr = (__force __be32)inet->inet_daddr;
+ sport = (__force __be16)inet->inet_sport;
+ dport = (__force __be16)inet->inet_dport;
+ }
+
+ /* XXX sigh, inet-> doesn't have sparse annotation so any
+ * use of it here generates a warning with -Wbitwise */
+ seq_printf(seq, "%p:\n"
+ " krefs: %d\n"
+ " sock: %pI4:%u -> "
+ "%pI4:%u\n"
+ " remote node: %s\n"
+ " page off: %zu\n"
+ " handshake ok: %u\n"
+ " timer: %lld usecs\n"
+ " data ready: %lld usecs\n"
+ " advance start: %lld usecs\n"
+ " advance stop: %lld usecs\n"
+ " func start: %lld usecs\n"
+ " func stop: %lld usecs\n"
+ " func key: 0x%08x\n"
+ " func type: %u\n",
+ sc,
+ atomic_read(&sc->sc_kref.refcount),
+ &saddr, inet ? ntohs(sport) : 0,
+ &daddr, inet ? ntohs(dport) : 0,
+ sc->sc_node->nd_name,
+ sc->sc_page_off,
+ sc->sc_handshake_ok,
+ (long long)ktime_to_us(sc->sc_tv_timer),
+ (long long)ktime_to_us(sc->sc_tv_data_ready),
+ (long long)ktime_to_us(sc->sc_tv_advance_start),
+ (long long)ktime_to_us(sc->sc_tv_advance_stop),
+ (long long)ktime_to_us(sc->sc_tv_func_start),
+ (long long)ktime_to_us(sc->sc_tv_func_stop),
+ sc->sc_msg_key,
+ sc->sc_msg_type);
+}
+
+static int sc_seq_show(struct seq_file *seq, void *v)
+{
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
+
+ spin_lock(&o2net_debug_lock);
+ sc = next_sc(dummy_sc);
+
+ if (sc) {
+ if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
+ sc_show_sock_container(seq, sc);
+ else
+ sc_show_sock_stats(seq, sc);
+ }
+
+ spin_unlock(&o2net_debug_lock);
+
+ return 0;
+}
+
+static void sc_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations sc_seq_ops = {
+ .start = sc_seq_start,
+ .next = sc_seq_next,
+ .stop = sc_seq_stop,
+ .show = sc_seq_show,
+};
+
+static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
+{
+ struct o2net_sock_container *dummy_sc;
+ struct seq_file *seq;
+ int ret;
+
+ dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
+ if (dummy_sc == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dummy_sc->sc_page = NULL;
+
+ ret = seq_open(file, &sc_seq_ops);
+ if (ret)
+ goto out;
+
+ seq = file->private_data;
+ seq->private = sd;
+ sd->dbg_sock = dummy_sc;
+ o2net_debug_add_sc(dummy_sc);
+
+ dummy_sc = NULL;
+
+out:
+ kfree(dummy_sc);
+ return ret;
+}
+
+static int sc_fop_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *dummy_sc = sd->dbg_sock;
+
+ o2net_debug_del_sc(dummy_sc);
+ return seq_release_private(inode, file);
+}
+
+static int stats_fop_open(struct inode *inode, struct file *file)
+{
+ struct o2net_sock_debug *sd;
+
+ sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
+ if (sd == NULL)
+ return -ENOMEM;
+
+ sd->dbg_ctxt = SHOW_SOCK_STATS;
+ sd->dbg_sock = NULL;
+
+ return sc_common_open(file, sd);
+}
+
+static const struct file_operations stats_seq_fops = {
+ .open = stats_fop_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = sc_fop_release,
+};
+
+static int sc_fop_open(struct inode *inode, struct file *file)
+{
+ struct o2net_sock_debug *sd;
+
+ sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
+ if (sd == NULL)
+ return -ENOMEM;
+
+ sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
+ sd->dbg_sock = NULL;
+
+ return sc_common_open(file, sd);
+}
+
+static const struct file_operations sc_seq_fops = {
+ .open = sc_fop_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = sc_fop_release,
+};
+
+static int o2net_fill_bitmap(char *buf, int len)
+{
+ unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ int i = -1, out = 0;
+
+ o2net_fill_node_map(map, sizeof(map));
+
+ while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+ out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+ out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+
+ return out;
+}
+
+static int nodes_fop_open(struct inode *inode, struct file *file)
+{
+ char *buf;
+
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
+
+ file->private_data = buf;
+
+ return 0;
+}
+
+static int o2net_debug_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static ssize_t o2net_debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+ i_size_read(file->f_mapping->host));
+}
+
+static const struct file_operations nodes_fops = {
+ .open = nodes_fop_open,
+ .release = o2net_debug_release,
+ .read = o2net_debug_read,
+ .llseek = generic_file_llseek,
+};
+
+void o2net_debugfs_exit(void)
+{
+ debugfs_remove(nodes_dentry);
+ debugfs_remove(stats_dentry);
+ debugfs_remove(sc_dentry);
+ debugfs_remove(nst_dentry);
+ debugfs_remove(o2net_dentry);
+}
+
+int o2net_debugfs_init(void)
+{
+ umode_t mode = S_IFREG|S_IRUSR;
+
+ o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
+ if (o2net_dentry)
+ nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &nst_seq_fops);
+ if (nst_dentry)
+ sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &sc_seq_fops);
+ if (sc_dentry)
+ stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &stats_seq_fops);
+ if (stats_dentry)
+ nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &nodes_fops);
+ if (nodes_dentry)
+ return 0;
+
+ o2net_debugfs_exit();
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+}
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index e1fceb8aa32..441c84e169e 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,93 +19,25 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/sysctl.h>
#include <linux/configfs.h>
-#include "endian.h"
#include "tcp.h"
#include "nodemanager.h"
#include "heartbeat.h"
#include "masklog.h"
#include "sys.h"
-#include "ver.h"
/* for now we operate under the assertion that there can be only one
* cluster active at a time. Changing this will require trickling
* cluster references throughout where nodes are looked up */
-static struct o2nm_cluster *o2nm_single_cluster = NULL;
-
-#define OCFS2_MAX_HB_CTL_PATH 256
-static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
-
-static ctl_table ocfs2_nm_table[] = {
- {
- .ctl_name = 1,
- .procname = "hb_ctl_path",
- .data = ocfs2_hb_ctl_path,
- .maxlen = OCFS2_MAX_HB_CTL_PATH,
- .mode = 0644,
- .proc_handler = &proc_dostring,
- .strategy = &sysctl_string,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ocfs2_mod_table[] = {
- {
- .ctl_name = KERN_OCFS2_NM,
- .procname = "nm",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_nm_table
- },
- { .ctl_name = 0}
-};
+struct o2nm_cluster *o2nm_single_cluster = NULL;
-static ctl_table ocfs2_kern_table[] = {
- {
- .ctl_name = KERN_OCFS2,
- .procname = "ocfs2",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_mod_table
- },
- { .ctl_name = 0}
-};
-
-static ctl_table ocfs2_root_table[] = {
- {
- .ctl_name = CTL_FS,
- .procname = "fs",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_kern_table
- },
- { .ctl_name = 0 }
-};
-
-static struct ctl_table_header *ocfs2_table_header = NULL;
-
-const char *o2nm_get_hb_ctl_path(void)
-{
- return ocfs2_hb_ctl_path;
-}
-EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path);
-
-struct o2nm_cluster {
- struct config_group cl_group;
- unsigned cl_has_local:1;
- u8 cl_local_node;
- rwlock_t cl_nodes_lock;
- struct o2nm_node *cl_nodes[O2NM_MAX_NODES];
- struct rb_root cl_node_ip_tree;
- /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
- unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
+ "reset", /* O2NM_FENCE_RESET */
+ "panic", /* O2NM_FENCE_PANIC */
};
struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
@@ -152,14 +84,16 @@ static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
struct o2nm_node *node, *ret = NULL;
while (*p) {
+ int cmp;
+
parent = *p;
node = rb_entry(parent, struct o2nm_node, nd_ip_node);
- if (memcmp(&ip_needle, &node->nd_ipv4_address,
- sizeof(ip_needle)) < 0)
+ cmp = memcmp(&ip_needle, &node->nd_ipv4_address,
+ sizeof(ip_needle));
+ if (cmp < 0)
p = &(*p)->rb_left;
- else if (memcmp(&ip_needle, &node->nd_ipv4_address,
- sizeof(ip_needle)) > 0)
+ else if (cmp > 0)
p = &(*p)->rb_right;
else {
ret = node;
@@ -320,7 +254,7 @@ static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
{
- return sprintf(page, "%u.%u.%u.%u\n", NIPQUAD(node->nd_ipv4_address));
+ return sprintf(page, "%pI4\n", &node->nd_ipv4_address);
}
static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
@@ -541,30 +475,244 @@ static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
}
#endif
+struct o2nm_cluster_attribute {
+ struct configfs_attribute attr;
+ ssize_t (*show)(struct o2nm_cluster *, char *);
+ ssize_t (*store)(struct o2nm_cluster *, const char *, size_t);
+};
+
+static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
+ unsigned int *val)
+{
+ unsigned long tmp;
+ char *p = (char *)page;
+
+ tmp = simple_strtoul(p, &p, 0);
+ if (!p || (*p && (*p != '\n')))
+ return -EINVAL;
+
+ if (tmp == 0)
+ return -EINVAL;
+ if (tmp >= (u32)-1)
+ return -ERANGE;
+
+ *val = tmp;
+
+ return count;
+}
+
+static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(
+ struct o2nm_cluster *cluster, char *page)
+{
+ return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
+}
+
+static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
+ struct o2nm_cluster *cluster, const char *page, size_t count)
+{
+ ssize_t ret;
+ unsigned int val;
+
+ ret = o2nm_cluster_attr_write(page, count, &val);
+
+ if (ret > 0) {
+ if (cluster->cl_idle_timeout_ms != val
+ && o2net_num_connected_peers()) {
+ mlog(ML_NOTICE,
+ "o2net: cannot change idle timeout after "
+ "the first peer has agreed to it."
+ " %d connected peers\n",
+ o2net_num_connected_peers());
+ ret = -EINVAL;
+ } else if (val <= cluster->cl_keepalive_delay_ms) {
+ mlog(ML_NOTICE, "o2net: idle timeout must be larger "
+ "than keepalive delay\n");
+ ret = -EINVAL;
+ } else {
+ cluster->cl_idle_timeout_ms = val;
+ }
+ }
+
+ return ret;
+}
+
+static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read(
+ struct o2nm_cluster *cluster, char *page)
+{
+ return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms);
+}
+
+static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
+ struct o2nm_cluster *cluster, const char *page, size_t count)
+{
+ ssize_t ret;
+ unsigned int val;
+
+ ret = o2nm_cluster_attr_write(page, count, &val);
+
+ if (ret > 0) {
+ if (cluster->cl_keepalive_delay_ms != val
+ && o2net_num_connected_peers()) {
+ mlog(ML_NOTICE,
+ "o2net: cannot change keepalive delay after"
+ " the first peer has agreed to it."
+ " %d connected peers\n",
+ o2net_num_connected_peers());
+ ret = -EINVAL;
+ } else if (val >= cluster->cl_idle_timeout_ms) {
+ mlog(ML_NOTICE, "o2net: keepalive delay must be "
+ "smaller than idle timeout\n");
+ ret = -EINVAL;
+ } else {
+ cluster->cl_keepalive_delay_ms = val;
+ }
+ }
+
+ return ret;
+}
+
+static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read(
+ struct o2nm_cluster *cluster, char *page)
+{
+ return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms);
+}
+
+static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
+ struct o2nm_cluster *cluster, const char *page, size_t count)
+{
+ return o2nm_cluster_attr_write(page, count,
+ &cluster->cl_reconnect_delay_ms);
+}
+
+static ssize_t o2nm_cluster_attr_fence_method_read(
+ struct o2nm_cluster *cluster, char *page)
+{
+ ssize_t ret = 0;
+
+ if (cluster)
+ ret = sprintf(page, "%s\n",
+ o2nm_fence_method_desc[cluster->cl_fence_method]);
+ return ret;
+}
+
+static ssize_t o2nm_cluster_attr_fence_method_write(
+ struct o2nm_cluster *cluster, const char *page, size_t count)
+{
+ unsigned int i;
+
+ if (page[count - 1] != '\n')
+ goto bail;
+
+ for (i = 0; i < O2NM_FENCE_METHODS; ++i) {
+ if (count != strlen(o2nm_fence_method_desc[i]) + 1)
+ continue;
+ if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
+ continue;
+ if (cluster->cl_fence_method != i) {
+ printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
+ o2nm_fence_method_desc[i]);
+ cluster->cl_fence_method = i;
+ }
+ return count;
+ }
+
+bail:
+ return -EINVAL;
+}
+
+static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "idle_timeout_ms",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = o2nm_cluster_attr_idle_timeout_ms_read,
+ .store = o2nm_cluster_attr_idle_timeout_ms_write,
+};
+
+static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "keepalive_delay_ms",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = o2nm_cluster_attr_keepalive_delay_ms_read,
+ .store = o2nm_cluster_attr_keepalive_delay_ms_write,
+};
+
+static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "reconnect_delay_ms",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = o2nm_cluster_attr_reconnect_delay_ms_read,
+ .store = o2nm_cluster_attr_reconnect_delay_ms_write,
+};
+
+static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "fence_method",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = o2nm_cluster_attr_fence_method_read,
+ .store = o2nm_cluster_attr_fence_method_write,
+};
+
+static struct configfs_attribute *o2nm_cluster_attrs[] = {
+ &o2nm_cluster_attr_idle_timeout_ms.attr,
+ &o2nm_cluster_attr_keepalive_delay_ms.attr,
+ &o2nm_cluster_attr_reconnect_delay_ms.attr,
+ &o2nm_cluster_attr_fence_method.attr,
+ NULL,
+};
+static ssize_t o2nm_cluster_show(struct config_item *item,
+ struct configfs_attribute *attr,
+ char *page)
+{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+ struct o2nm_cluster_attribute *o2nm_cluster_attr =
+ container_of(attr, struct o2nm_cluster_attribute, attr);
+ ssize_t ret = 0;
+
+ if (o2nm_cluster_attr->show)
+ ret = o2nm_cluster_attr->show(cluster, page);
+ return ret;
+}
+
+static ssize_t o2nm_cluster_store(struct config_item *item,
+ struct configfs_attribute *attr,
+ const char *page, size_t count)
+{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+ struct o2nm_cluster_attribute *o2nm_cluster_attr =
+ container_of(attr, struct o2nm_cluster_attribute, attr);
+ ssize_t ret;
+
+ if (o2nm_cluster_attr->store == NULL) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = o2nm_cluster_attr->store(cluster, page, count);
+ if (ret < count)
+ goto out;
+out:
+ return ret;
+}
+
static struct config_item *o2nm_node_group_make_item(struct config_group *group,
const char *name)
{
struct o2nm_node *node = NULL;
- struct config_item *ret = NULL;
if (strlen(name) > O2NM_MAX_NAME_LEN)
- goto out; /* ENAMETOOLONG */
+ return ERR_PTR(-ENAMETOOLONG);
- node = kcalloc(1, sizeof(struct o2nm_node), GFP_KERNEL);
+ node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL);
if (node == NULL)
- goto out; /* ENOMEM */
+ return ERR_PTR(-ENOMEM);
strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
spin_lock_init(&node->nd_lock);
- ret = &node->nd_item;
+ mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
-out:
- if (ret == NULL)
- kfree(node);
-
- return ret;
+ return &node->nd_item;
}
static void o2nm_node_group_drop_item(struct config_group *group,
@@ -597,6 +745,9 @@ static void o2nm_node_group_drop_item(struct config_group *group,
}
write_unlock(&cluster->cl_nodes_lock);
+ mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
+ config_item_name(&node->nd_item));
+
config_item_put(item);
}
@@ -622,10 +773,13 @@ static void o2nm_cluster_release(struct config_item *item)
static struct configfs_item_operations o2nm_cluster_item_ops = {
.release = o2nm_cluster_release,
+ .show_attribute = o2nm_cluster_show,
+ .store_attribute = o2nm_cluster_store,
};
static struct config_item_type o2nm_cluster_type = {
.ct_item_ops = &o2nm_cluster_item_ops,
+ .ct_attrs = o2nm_cluster_attrs,
.ct_owner = THIS_MODULE,
};
@@ -656,10 +810,10 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
/* this runs under the parent dir's i_mutex; there can be only
* one caller in here at a time */
if (o2nm_single_cluster)
- goto out; /* ENOSPC */
+ return ERR_PTR(-ENOSPC);
- cluster = kcalloc(1, sizeof(struct o2nm_cluster), GFP_KERNEL);
- ns = kcalloc(1, sizeof(struct o2nm_node_group), GFP_KERNEL);
+ cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
+ ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
o2hb_group = o2hb_alloc_hb_set();
if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
@@ -676,6 +830,10 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
cluster->cl_group.default_groups[2] = NULL;
rwlock_init(&cluster->cl_nodes_lock);
cluster->cl_node_ip_tree = RB_ROOT;
+ cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
+ cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
+ cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
+ cluster->cl_fence_method = O2NM_FENCE_RESET;
ret = &cluster->cl_group;
o2nm_single_cluster = cluster;
@@ -686,6 +844,7 @@ out:
kfree(ns);
o2hb_free_hb_set(o2hb_group);
kfree(defs);
+ ret = ERR_PTR(-ENOMEM);
}
return ret;
@@ -730,41 +889,75 @@ static struct o2nm_cluster_group o2nm_cluster_group = {
},
};
-static void __exit exit_o2nm(void)
+int o2nm_depend_item(struct config_item *item)
{
- if (ocfs2_table_header)
- unregister_sysctl_table(ocfs2_table_header);
+ return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
+}
+void o2nm_undepend_item(struct config_item *item)
+{
+ configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
+}
+
+int o2nm_depend_this_node(void)
+{
+ int ret = 0;
+ struct o2nm_node *local_node;
+
+ local_node = o2nm_get_node_by_num(o2nm_this_node());
+ if (!local_node) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = o2nm_depend_item(&local_node->nd_item);
+ o2nm_node_put(local_node);
+
+out:
+ return ret;
+}
+
+void o2nm_undepend_this_node(void)
+{
+ struct o2nm_node *local_node;
+
+ local_node = o2nm_get_node_by_num(o2nm_this_node());
+ BUG_ON(!local_node);
+
+ o2nm_undepend_item(&local_node->nd_item);
+ o2nm_node_put(local_node);
+}
+
+
+static void __exit exit_o2nm(void)
+{
/* XXX sync with hb callbacks and shut down hb? */
o2net_unregister_hb_callbacks();
configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
o2cb_sys_shutdown();
o2net_exit();
+ o2hb_exit();
}
static int __init init_o2nm(void)
{
int ret = -1;
- cluster_print_version();
-
- o2hb_init();
- o2net_init();
+ ret = o2hb_init();
+ if (ret)
+ goto out;
- ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0);
- if (!ocfs2_table_header) {
- printk(KERN_ERR "nodemanager: unable to register sysctl\n");
- ret = -ENOMEM; /* or something. */
- goto out_o2net;
- }
+ ret = o2net_init();
+ if (ret)
+ goto out_o2hb;
ret = o2net_register_hb_callbacks();
if (ret)
- goto out_sysctl;
+ goto out_o2net;
config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
- init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem);
+ mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
if (ret) {
printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
@@ -778,16 +971,17 @@ static int __init init_o2nm(void)
configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
out_callbacks:
o2net_unregister_hb_callbacks();
-out_sysctl:
- unregister_sysctl_table(ocfs2_table_header);
out_o2net:
o2net_exit();
+out_o2hb:
+ o2hb_exit();
out:
return ret;
}
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster management");
module_init(init_o2nm)
module_exit(exit_o2nm)
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index fce8033c310..09ea2d388bb 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -33,10 +33,11 @@
#include <linux/configfs.h>
#include <linux/rbtree.h>
-#define KERN_OCFS2 988
-#define KERN_OCFS2_NM 1
-
-const char *o2nm_get_hb_ctl_path(void);
+enum o2nm_fence_method {
+ O2NM_FENCE_RESET = 0,
+ O2NM_FENCE_PANIC,
+ O2NM_FENCE_METHODS, /* Number of fence methods */
+};
struct o2nm_node {
spinlock_t nd_lock;
@@ -53,6 +54,24 @@ struct o2nm_node {
unsigned long nd_set_attributes;
};
+struct o2nm_cluster {
+ struct config_group cl_group;
+ unsigned cl_has_local:1;
+ u8 cl_local_node;
+ rwlock_t cl_nodes_lock;
+ struct o2nm_node *cl_nodes[O2NM_MAX_NODES];
+ struct rb_root cl_node_ip_tree;
+ unsigned int cl_idle_timeout_ms;
+ unsigned int cl_keepalive_delay_ms;
+ unsigned int cl_reconnect_delay_ms;
+ enum o2nm_fence_method cl_fence_method;
+
+ /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
+ unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+};
+
+extern struct o2nm_cluster *o2nm_single_cluster;
+
u8 o2nm_this_node(void);
int o2nm_configured_node_map(unsigned long *map, unsigned bytes);
@@ -61,4 +80,9 @@ struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
void o2nm_node_get(struct o2nm_node *node);
void o2nm_node_put(struct o2nm_node *node);
+int o2nm_depend_item(struct config_item *item);
+void o2nm_undepend_item(struct config_item *item);
+int o2nm_depend_this_node(void);
+void o2nm_undepend_this_node(void);
+
#endif /* O2CLUSTER_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 5b9854bad57..49b594325be 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -36,4 +36,10 @@
/* host name, group name, cluster name all 64 bytes */
#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
+/*
+ * Maximum number of global heartbeat regions allowed.
+ * **CAUTION** Changing this number will break dlm compatibility.
+ */
+#define O2NM_MAX_REGIONS 32
+
#endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 7bba98fbfc1..1ec141e758d 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,8 +44,8 @@
* and if they're the last, they fire off the decision.
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
#include <linux/workqueue.h>
+#include <linux/reboot.h>
#include "heartbeat.h"
#include "nodemanager.h"
@@ -72,10 +72,24 @@ static void o2quo_fence_self(void)
/* panic spins with interrupts enabled. with preempt
* threads can still schedule, etc, etc */
o2hb_stop_all_regions();
- panic("ocfs2 is very sorry to be fencing this system by panicing\n");
+
+ switch (o2nm_single_cluster->cl_fence_method) {
+ case O2NM_FENCE_PANIC:
+ panic("*** ocfs2 is very sorry to be fencing this system by "
+ "panicing ***\n");
+ break;
+ default:
+ WARN_ON(o2nm_single_cluster->cl_fence_method >=
+ O2NM_FENCE_METHODS);
+ case O2NM_FENCE_RESET:
+ printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
+ "system by restarting ***\n");
+ emergency_restart();
+ break;
+ };
}
-/* Indicate that a timeout occured on a hearbeat region write. The
+/* Indicate that a timeout occurred on a hearbeat region write. The
* other nodes in the cluster may consider us dead at that time so we
* want to "fence" ourselves so that we don't scribble on the disk
* after they think they've recovered us. This can't solve all
@@ -88,7 +102,7 @@ void o2quo_disk_timeout(void)
o2quo_fence_self();
}
-static void o2quo_make_decision(void *arg)
+static void o2quo_make_decision(struct work_struct *work)
{
int quorum;
int lowest_hb, lowest_reachable = 0, fence = 0;
@@ -247,10 +261,10 @@ void o2quo_hb_still_up(u8 node)
spin_unlock(&qs->qs_lock);
}
-/* This is analagous to hb_up. as a node's connection comes up we delay the
+/* This is analogous to hb_up. as a node's connection comes up we delay the
* quorum decision until we see it heartbeating. the hold will be droped in
* hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
- * it's already heartbeating we we might be dropping a hold that conn_up got.
+ * it's already heartbeating we might be dropping a hold that conn_up got.
* */
void o2quo_conn_up(u8 node)
{
@@ -306,10 +320,12 @@ void o2quo_init(void)
struct o2quo_state *qs = &o2quo_state;
spin_lock_init(&qs->qs_lock);
- INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL);
+ INIT_WORK(&qs->qs_work, o2quo_make_decision);
}
void o2quo_exit(void)
{
- flush_scheduled_work();
+ struct o2quo_state *qs = &o2quo_state;
+
+ flush_work(&qs->qs_work);
}
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 1d9f6acafa2..b7f57271d49 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -28,97 +28,55 @@
#include <linux/module.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
+#include <linux/fs.h>
#include "ocfs2_nodemanager.h"
#include "masklog.h"
#include "sys.h"
-struct o2cb_attribute {
- struct attribute attr;
- ssize_t (*show)(char *buf);
- ssize_t (*store)(const char *buf, size_t count);
-};
-
-#define O2CB_ATTR(_name, _mode, _show, _store) \
-struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
-
-#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset)
-#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
-static ssize_t o2cb_interface_revision_show(char *buf)
+static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
{
return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
}
-
-static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
+static struct kobj_attribute attr_version =
+ __ATTR(interface_revision, S_IRUGO, version_show, NULL);
static struct attribute *o2cb_attrs[] = {
- &o2cb_attr_interface_revision.attr,
+ &attr_version.attr,
NULL,
};
-static ssize_t
-o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer);
-static ssize_t
-o2cb_store(struct kobject * kobj, struct attribute * attr,
- const char * buffer, size_t count);
-static struct sysfs_ops o2cb_sysfs_ops = {
- .show = o2cb_show,
- .store = o2cb_store,
+static struct attribute_group o2cb_attr_group = {
+ .attrs = o2cb_attrs,
};
-static struct kobj_type o2cb_subsys_type = {
- .default_attrs = o2cb_attrs,
- .sysfs_ops = &o2cb_sysfs_ops,
-};
-
-/* gives us o2cb_subsys */
-static decl_subsys(o2cb, NULL, NULL);
-
-static ssize_t
-o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
-{
- struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
- struct subsystem *sbs = to_o2cb_subsys(kobj);
-
- BUG_ON(sbs != &o2cb_subsys);
-
- if (o2cb_attr->show)
- return o2cb_attr->show(buffer);
- return -EIO;
-}
-
-static ssize_t
-o2cb_store(struct kobject * kobj, struct attribute * attr,
- const char * buffer, size_t count)
-{
- struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
- struct subsystem *sbs = to_o2cb_subsys(kobj);
-
- BUG_ON(sbs != &o2cb_subsys);
-
- if (o2cb_attr->store)
- return o2cb_attr->store(buffer, count);
- return -EIO;
-}
+static struct kset *o2cb_kset;
void o2cb_sys_shutdown(void)
{
mlog_sys_shutdown();
- subsystem_unregister(&o2cb_subsys);
+ kset_unregister(o2cb_kset);
}
int o2cb_sys_init(void)
{
int ret;
- o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type;
- ret = subsystem_register(&o2cb_subsys);
+ o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj);
+ if (!o2cb_kset)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
if (ret)
- return ret;
+ goto error;
- ret = mlog_sys_init(&o2cb_subsys);
+ ret = mlog_sys_init(o2cb_kset);
if (ret)
- subsystem_unregister(&o2cb_subsys);
+ goto error;
+ return 0;
+error:
+ kset_unregister(o2cb_kset);
return ret;
}
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index b650efa8c8b..681691bc233 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -58,6 +58,8 @@
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/kref.h>
+#include <linux/net.h>
+#include <linux/export.h>
#include <net/tcp.h>
#include <asm/uaccess.h>
@@ -71,17 +73,9 @@
#include "tcp_internal.h"
-/*
- * The linux network stack isn't sparse endian clean.. It has macros like
- * ntohs() which perform the endian checks and structs like sockaddr_in
- * which aren't annotated. So __force is found here to get the build
- * clean. When they emerge from the dark ages and annotate the code
- * we can remove these.
- */
-
-#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
+#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
- NIPQUAD(sc->sc_node->nd_ipv4_address), \
+ &sc->sc_node->nd_ipv4_address, \
ntohs(sc->sc_node->nd_ipv4_port)
/*
@@ -114,7 +108,7 @@ static struct rb_root o2net_handler_tree = RB_ROOT;
static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
/* XXX someday we'll need better accounting */
-static struct socket *o2net_listen_sock = NULL;
+static struct socket *o2net_listen_sock;
/*
* listen work is only queued by the listening socket callbacks on the
@@ -140,13 +134,148 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] =
[O2NET_ERR_DIED] = -EHOSTDOWN,};
/* can't quite avoid *all* internal declarations :/ */
-static void o2net_sc_connect_completed(void *arg);
-static void o2net_rx_until_empty(void *arg);
-static void o2net_shutdown_sc(void *arg);
-static void o2net_listen_data_ready(struct sock *sk, int bytes);
-static void o2net_sc_send_keep_req(void *arg);
+static void o2net_sc_connect_completed(struct work_struct *work);
+static void o2net_rx_until_empty(struct work_struct *work);
+static void o2net_shutdown_sc(struct work_struct *work);
+static void o2net_listen_data_ready(struct sock *sk);
+static void o2net_sc_send_keep_req(struct work_struct *work);
static void o2net_idle_timer(unsigned long data);
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
+static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
+
+#ifdef CONFIG_DEBUG_FS
+static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+ u32 msgkey, struct task_struct *task, u8 node)
+{
+ INIT_LIST_HEAD(&nst->st_net_debug_item);
+ nst->st_task = task;
+ nst->st_msg_type = msgtype;
+ nst->st_msg_key = msgkey;
+ nst->st_node = node;
+}
+
+static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+{
+ nst->st_sock_time = ktime_get();
+}
+
+static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+{
+ nst->st_send_time = ktime_get();
+}
+
+static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+{
+ nst->st_status_time = ktime_get();
+}
+
+static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+ struct o2net_sock_container *sc)
+{
+ nst->st_sc = sc;
+}
+
+static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+ u32 msg_id)
+{
+ nst->st_id = msg_id;
+}
+
+static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
+{
+ sc->sc_tv_timer = ktime_get();
+}
+
+static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
+{
+ sc->sc_tv_data_ready = ktime_get();
+}
+
+static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
+{
+ sc->sc_tv_advance_start = ktime_get();
+}
+
+static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
+{
+ sc->sc_tv_advance_stop = ktime_get();
+}
+
+static inline void o2net_set_func_start_time(struct o2net_sock_container *sc)
+{
+ sc->sc_tv_func_start = ktime_get();
+}
+
+static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
+{
+ sc->sc_tv_func_stop = ktime_get();
+}
+
+#else /* CONFIG_DEBUG_FS */
+# define o2net_init_nst(a, b, c, d, e)
+# define o2net_set_nst_sock_time(a)
+# define o2net_set_nst_send_time(a)
+# define o2net_set_nst_status_time(a)
+# define o2net_set_nst_sock_container(a, b)
+# define o2net_set_nst_msg_id(a, b)
+# define o2net_set_sock_timer(a)
+# define o2net_set_data_ready_time(a)
+# define o2net_set_advance_start_time(a)
+# define o2net_set_advance_stop_time(a)
+# define o2net_set_func_start_time(a)
+# define o2net_set_func_stop_time(a)
+#endif /* CONFIG_DEBUG_FS */
+
+#ifdef CONFIG_OCFS2_FS_STATS
+static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
+{
+ return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
+}
+
+static void o2net_update_send_stats(struct o2net_send_tracking *nst,
+ struct o2net_sock_container *sc)
+{
+ sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,
+ ktime_sub(ktime_get(),
+ nst->st_status_time));
+ sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,
+ ktime_sub(nst->st_status_time,
+ nst->st_send_time));
+ sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,
+ ktime_sub(nst->st_send_time,
+ nst->st_sock_time));
+ sc->sc_send_count++;
+}
+
+static void o2net_update_recv_stats(struct o2net_sock_container *sc)
+{
+ sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
+ o2net_get_func_run_time(sc));
+ sc->sc_recv_count++;
+}
+
+#else
+
+# define o2net_update_send_stats(a, b)
+
+# define o2net_update_recv_stats(sc)
+
+#endif /* CONFIG_OCFS2_FS_STATS */
+
+static inline unsigned int o2net_reconnect_delay(void)
+{
+ return o2nm_single_cluster->cl_reconnect_delay_ms;
+}
+
+static inline unsigned int o2net_keepalive_delay(void)
+{
+ return o2nm_single_cluster->cl_keepalive_delay_ms;
+}
+
+static inline unsigned int o2net_idle_timeout(void)
+{
+ return o2nm_single_cluster->cl_idle_timeout_ms;
+}
static inline int o2net_sys_err_to_errno(enum o2net_system_error err)
{
@@ -175,28 +304,22 @@ static u8 o2net_num_from_nn(struct o2net_node *nn)
static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)
{
- int ret = 0;
-
- do {
- if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) {
- ret = -EAGAIN;
- break;
- }
- spin_lock(&nn->nn_lock);
- ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id);
- if (ret == 0)
- list_add_tail(&nsw->ns_node_item,
- &nn->nn_status_list);
- spin_unlock(&nn->nn_lock);
- } while (ret == -EAGAIN);
+ int ret;
- if (ret == 0) {
- init_waitqueue_head(&nsw->ns_wq);
- nsw->ns_sys_status = O2NET_ERR_NONE;
- nsw->ns_status = 0;
+ spin_lock(&nn->nn_lock);
+ ret = idr_alloc(&nn->nn_status_idr, nsw, 0, 0, GFP_ATOMIC);
+ if (ret >= 0) {
+ nsw->ns_id = ret;
+ list_add_tail(&nsw->ns_node_item, &nn->nn_status_list);
}
+ spin_unlock(&nn->nn_lock);
+ if (ret < 0)
+ return ret;
- return ret;
+ init_waitqueue_head(&nsw->ns_wq);
+ nsw->ns_sys_status = O2NET_ERR_NONE;
+ nsw->ns_status = 0;
+ return 0;
}
static void o2net_complete_nsw_locked(struct o2net_node *nn,
@@ -239,14 +362,12 @@ out:
static void o2net_complete_nodes_nsw(struct o2net_node *nn)
{
- struct list_head *iter, *tmp;
+ struct o2net_status_wait *nsw, *tmp;
unsigned int num_kills = 0;
- struct o2net_status_wait *nsw;
assert_spin_locked(&nn->nn_lock);
- list_for_each_safe(iter, tmp, &nn->nn_status_list) {
- nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
+ list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) {
o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
num_kills++;
}
@@ -271,6 +392,8 @@ static void sc_kref_release(struct kref *kref)
{
struct o2net_sock_container *sc = container_of(kref,
struct o2net_sock_container, sc_kref);
+ BUG_ON(timer_pending(&sc->sc_idle_timeout));
+
sclog(sc, "releasing\n");
if (sc->sc_sock) {
@@ -278,9 +401,14 @@ static void sc_kref_release(struct kref *kref)
sc->sc_sock = NULL;
}
+ o2nm_undepend_item(&sc->sc_node->nd_item);
o2nm_node_put(sc->sc_node);
sc->sc_node = NULL;
+ o2net_debug_del_sc(sc);
+
+ if (sc->sc_page)
+ __free_page(sc->sc_page);
kfree(sc);
}
@@ -298,9 +426,10 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
{
struct o2net_sock_container *sc, *ret = NULL;
struct page *page = NULL;
+ int status = 0;
page = alloc_page(GFP_NOFS);
- sc = kcalloc(1, sizeof(*sc), GFP_NOFS);
+ sc = kzalloc(sizeof(*sc), GFP_NOFS);
if (sc == NULL || page == NULL)
goto out;
@@ -308,10 +437,17 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
o2nm_node_get(node);
sc->sc_node = node;
- INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed, sc);
- INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty, sc);
- INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc, sc);
- INIT_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req, sc);
+ /* pin the node item of the remote node */
+ status = o2nm_depend_item(&node->nd_item);
+ if (status) {
+ mlog_errno(status);
+ o2nm_node_put(node);
+ goto out;
+ }
+ INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed);
+ INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty);
+ INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
+ INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req);
init_timer(&sc->sc_idle_timeout);
sc->sc_idle_timeout.function = o2net_idle_timer;
@@ -321,6 +457,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
ret = sc;
sc->sc_page = page;
+ o2net_debug_add_sc(sc);
sc = NULL;
page = NULL;
@@ -342,7 +479,7 @@ static void o2net_sc_queue_work(struct o2net_sock_container *sc,
sc_put(sc);
}
static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc,
- struct work_struct *work,
+ struct delayed_work *work,
int delay)
{
sc_get(sc);
@@ -350,12 +487,19 @@ static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc,
sc_put(sc);
}
static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc,
- struct work_struct *work)
+ struct delayed_work *work)
{
if (cancel_delayed_work(work))
sc_put(sc);
}
+static atomic_t o2net_connected_peers = ATOMIC_INIT(0);
+
+int o2net_num_connected_peers(void)
+{
+ return atomic_read(&o2net_connected_peers);
+}
+
static void o2net_set_nn_state(struct o2net_node *nn,
struct o2net_sock_container *sc,
unsigned valid, int err)
@@ -366,14 +510,17 @@ static void o2net_set_nn_state(struct o2net_node *nn,
assert_spin_locked(&nn->nn_lock);
+ if (old_sc && !sc)
+ atomic_dec(&o2net_connected_peers);
+ else if (!old_sc && sc)
+ atomic_inc(&o2net_connected_peers);
+
/* the node num comparison and single connect/accept path should stop
* an non-null sc from being overwritten with another */
BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc);
mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
- /* we won't reconnect after our valid conn goes away for
- * this hb iteration.. here so it shows up in the logs */
if (was_valid && !valid && err == 0)
err = -ENOTCONN;
@@ -396,22 +543,18 @@ static void o2net_set_nn_state(struct o2net_node *nn,
}
if (was_valid && !valid) {
- printk(KERN_INFO "o2net: no longer connected to "
- SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
+ if (old_sc)
+ printk(KERN_NOTICE "o2net: No longer connected to "
+ SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
o2net_complete_nodes_nsw(nn);
}
if (!was_valid && valid) {
o2quo_conn_up(o2net_num_from_nn(nn));
- /* this is a bit of a hack. we only try reconnecting
- * when heartbeating starts until we get a connection.
- * if that connection then dies we don't try reconnecting.
- * the only way to start connecting again is to down
- * heartbeat and bring it back up. */
cancel_delayed_work(&nn->nn_connect_expired);
- printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
+ printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
o2nm_this_node() > sc->sc_node->nd_num ?
- "connected to" : "accepted connection from",
+ "Connected to" : "Accepted connection from",
SC_NODEF_ARGS(sc));
}
@@ -421,15 +564,27 @@ static void o2net_set_nn_state(struct o2net_node *nn,
* the work queue actually being up. */
if (!valid && o2net_wq) {
unsigned long delay;
- /* delay if we're withing a RECONNECT_DELAY of the
+ /* delay if we're within a RECONNECT_DELAY of the
* last attempt */
delay = (nn->nn_last_connect_attempt +
- msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
+ msecs_to_jiffies(o2net_reconnect_delay()))
- jiffies;
- if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
+ if (delay > msecs_to_jiffies(o2net_reconnect_delay()))
delay = 0;
mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
+
+ /*
+ * Delay the expired work after idle timeout.
+ *
+ * We might have lots of failed connection attempts that run
+ * through here but we only cancel the connect_expired work when
+ * a connection attempt succeeds. So only the first enqueue of
+ * the connect_expired work will do anything. The rest will see
+ * that it's already queued and do nothing.
+ */
+ delay += msecs_to_jiffies(o2net_idle_timeout());
+ queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay);
}
/* keep track of the nn's sc ref for the caller */
@@ -442,15 +597,15 @@ static void o2net_set_nn_state(struct o2net_node *nn,
}
/* see o2net_register_callbacks() */
-static void o2net_data_ready(struct sock *sk, int bytes)
+static void o2net_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
read_lock(&sk->sk_callback_lock);
if (sk->sk_user_data) {
struct o2net_sock_container *sc = sk->sk_user_data;
sclog(sc, "data_ready hit\n");
- do_gettimeofday(&sc->sc_tv_data_ready);
+ o2net_set_data_ready_time(sc);
o2net_sc_queue_work(sc, &sc->sc_rx_work);
ready = sc->sc_data_ready;
} else {
@@ -458,7 +613,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
}
read_unlock(&sk->sk_callback_lock);
- ready(sk, bytes);
+ ready(sk);
}
/* see o2net_register_callbacks() */
@@ -479,16 +634,19 @@ static void o2net_state_change(struct sock *sk)
state_change = sc->sc_state_change;
switch(sk->sk_state) {
- /* ignore connecting sockets as they make progress */
- case TCP_SYN_SENT:
- case TCP_SYN_RECV:
- break;
- case TCP_ESTABLISHED:
- o2net_sc_queue_work(sc, &sc->sc_connect_work);
- break;
- default:
- o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
- break;
+ /* ignore connecting sockets as they make progress */
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ break;
+ case TCP_ESTABLISHED:
+ o2net_sc_queue_work(sc, &sc->sc_connect_work);
+ break;
+ default:
+ printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
+ " shutdown, state %d\n",
+ SC_NODEF_ARGS(sc), sk->sk_state);
+ o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+ break;
}
out:
read_unlock(&sk->sk_callback_lock);
@@ -520,6 +678,8 @@ static void o2net_register_callbacks(struct sock *sk,
sk->sk_data_ready = o2net_data_ready;
sk->sk_state_change = o2net_state_change;
+ mutex_init(&sc->sc_send_lock);
+
write_unlock_bh(&sk->sk_callback_lock);
}
@@ -564,9 +724,11 @@ static void o2net_ensure_shutdown(struct o2net_node *nn,
* ourselves as state_change couldn't get the nn_lock and call set_nn_state
* itself.
*/
-static void o2net_shutdown_sc(void *arg)
+static void o2net_shutdown_sc(struct work_struct *work)
{
- struct o2net_sock_container *sc = arg;
+ struct o2net_sock_container *sc =
+ container_of(work, struct o2net_sock_container,
+ sc_shutdown_work);
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
sclog(sc, "shutting down\n");
@@ -578,8 +740,7 @@ static void o2net_shutdown_sc(void *arg)
del_timer_sync(&sc->sc_idle_timeout);
o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
sc_put(sc);
- sc->sc_sock->ops->shutdown(sc->sc_sock,
- RCV_SHUTDOWN|SEND_SHUTDOWN);
+ kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR);
}
/* not fatal so failed connects before the other guy has our
@@ -605,32 +766,32 @@ static struct o2net_msg_handler *
o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
struct rb_node **ret_parent)
{
- struct rb_node **p = &o2net_handler_tree.rb_node;
- struct rb_node *parent = NULL;
+ struct rb_node **p = &o2net_handler_tree.rb_node;
+ struct rb_node *parent = NULL;
struct o2net_msg_handler *nmh, *ret = NULL;
int cmp;
- while (*p) {
- parent = *p;
- nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
+ while (*p) {
+ parent = *p;
+ nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
cmp = o2net_handler_cmp(nmh, msg_type, key);
- if (cmp < 0)
- p = &(*p)->rb_left;
- else if (cmp > 0)
- p = &(*p)->rb_right;
- else {
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else if (cmp > 0)
+ p = &(*p)->rb_right;
+ else {
ret = nmh;
- break;
+ break;
}
- }
+ }
- if (ret_p != NULL)
- *ret_p = p;
- if (ret_parent != NULL)
- *ret_parent = parent;
+ if (ret_p != NULL)
+ *ret_p = p;
+ if (ret_parent != NULL)
+ *ret_parent = parent;
- return ret;
+ return ret;
}
static void o2net_handler_kref_release(struct kref *kref)
@@ -650,6 +811,7 @@ static void o2net_handler_put(struct o2net_msg_handler *nmh)
* be given to the handler if their payload is longer than the max. */
int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
o2net_msg_handler_func *func, void *data,
+ o2net_post_msg_handler_func *post_func,
struct list_head *unreg_list)
{
struct o2net_msg_handler *nmh = NULL;
@@ -676,7 +838,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
goto out;
}
- nmh = kcalloc(1, sizeof(struct o2net_msg_handler), GFP_NOFS);
+ nmh = kzalloc(sizeof(struct o2net_msg_handler), GFP_NOFS);
if (nmh == NULL) {
ret = -ENOMEM;
goto out;
@@ -684,6 +846,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
nmh->nh_func = func;
nmh->nh_func_data = data;
+ nmh->nh_post_func = post_func;
nmh->nh_msg_type = msg_type;
nmh->nh_max_len = max_len;
nmh->nh_key = key;
@@ -705,7 +868,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
/* we've had some trouble with handlers seemingly vanishing. */
mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p,
&parent) == NULL,
- "couldn't find handler we *just* registerd "
+ "couldn't find handler we *just* registered "
"for type %u key %08x\n", msg_type, key);
}
write_unlock(&o2net_handler_lock);
@@ -722,13 +885,10 @@ EXPORT_SYMBOL_GPL(o2net_register_handler);
void o2net_unregister_handler_list(struct list_head *list)
{
- struct list_head *pos, *n;
- struct o2net_msg_handler *nmh;
+ struct o2net_msg_handler *nmh, *n;
write_lock(&o2net_handler_lock);
- list_for_each_safe(pos, n, list) {
- nmh = list_entry(pos, struct o2net_msg_handler,
- nh_unregister_item);
+ list_for_each_entry_safe(nmh, n, list, nh_unregister_item) {
mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
rb_erase(&nmh->nh_node, &o2net_handler_tree);
@@ -756,57 +916,30 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
{
- int ret;
- mm_segment_t oldfs;
- struct kvec vec = {
- .iov_len = len,
- .iov_base = data,
- };
- struct msghdr msg = {
- .msg_iovlen = 1,
- .msg_iov = (struct iovec *)&vec,
- .msg_flags = MSG_DONTWAIT,
- };
-
- oldfs = get_fs();
- set_fs(get_ds());
- ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
- set_fs(oldfs);
-
- return ret;
+ struct kvec vec = { .iov_len = len, .iov_base = data, };
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
+ return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags);
}
static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
size_t veclen, size_t total)
{
int ret;
- mm_segment_t oldfs;
- struct msghdr msg = {
- .msg_iov = (struct iovec *)vec,
- .msg_iovlen = veclen,
- };
+ struct msghdr msg;
if (sock == NULL) {
ret = -EINVAL;
goto out;
}
- oldfs = get_fs();
- set_fs(get_ds());
- ret = sock_sendmsg(sock, &msg, total);
- set_fs(oldfs);
- if (ret != total) {
- mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
- total);
- if (ret >= 0)
- ret = -EPIPE; /* should be smarter, I bet */
- goto out;
- }
-
- ret = 0;
+ ret = kernel_sendmsg(sock, &msg, vec, veclen, total);
+ if (likely(ret == total))
+ return 0;
+ mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total);
+ if (ret >= 0)
+ ret = -EPIPE; /* should be smarter, I bet */
out:
- if (ret < 0)
- mlog(0, "returning error: %d\n", ret);
+ mlog(0, "returning error: %d\n", ret);
return ret;
}
@@ -817,15 +950,25 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
ssize_t ret;
-
- ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
- virt_to_page(kmalloced_virt),
- (long)kmalloced_virt & ~PAGE_MASK,
- size, MSG_DONTWAIT);
- if (ret != size) {
- mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
+ while (1) {
+ mutex_lock(&sc->sc_send_lock);
+ ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
+ virt_to_page(kmalloced_virt),
+ (long)kmalloced_virt & ~PAGE_MASK,
+ size, MSG_DONTWAIT);
+ mutex_unlock(&sc->sc_send_lock);
+ if (ret == size)
+ break;
+ if (ret == (ssize_t)-EAGAIN) {
+ mlog(0, "sendpage of size %zu to " SC_NODEF_FMT
+ " returned EAGAIN\n", size, SC_NODEF_ARGS(sc));
+ cond_resched();
+ continue;
+ }
+ mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
" failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
o2net_ensure_shutdown(nn, sc, 0);
+ break;
}
}
@@ -863,10 +1006,29 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
return ret;
}
+/* Get a map of all nodes to which this node is currently connected to */
+void o2net_fill_node_map(unsigned long *map, unsigned bytes)
+{
+ struct o2net_sock_container *sc;
+ int node, ret;
+
+ BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
+
+ memset(map, 0, bytes);
+ for (node = 0; node < O2NM_MAX_NODES; ++node) {
+ o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
+ if (!ret) {
+ set_bit(node, map);
+ sc_put(sc);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(o2net_fill_node_map);
+
int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
size_t caller_veclen, u8 target_node, int *status)
{
- int ret, error = 0;
+ int ret = 0;
struct o2net_msg *msg = NULL;
size_t veclen, caller_bytes = 0;
struct kvec *vec = NULL;
@@ -875,6 +1037,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
struct o2net_status_wait nsw = {
.ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
};
+ struct o2net_send_tracking nst;
+
+ o2net_init_nst(&nst, msg_type, key, current, target_node);
if (o2net_wq == NULL) {
mlog(0, "attempt to tx without o2netd running\n");
@@ -900,13 +1065,16 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
goto out;
}
- ret = wait_event_interruptible(nn->nn_sc_wq,
- o2net_tx_can_proceed(nn, &sc, &error));
- if (!ret && error)
- ret = error;
+ o2net_debug_add_nst(&nst);
+
+ o2net_set_nst_sock_time(&nst);
+
+ wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret));
if (ret)
goto out;
+ o2net_set_nst_sock_container(&nst, sc);
+
veclen = caller_veclen + 1;
vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
if (vec == NULL) {
@@ -933,11 +1101,16 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
goto out;
msg->msg_num = cpu_to_be32(nsw.ns_id);
+ o2net_set_nst_msg_id(&nst, nsw.ns_id);
+
+ o2net_set_nst_send_time(&nst);
/* finally, convert the message header to network byte-order
* and send */
+ mutex_lock(&sc->sc_send_lock);
ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen,
sizeof(struct o2net_msg) + caller_bytes);
+ mutex_unlock(&sc->sc_send_lock);
msglog(msg, "sending returned %d\n", ret);
if (ret < 0) {
mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret);
@@ -945,8 +1118,11 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
}
/* wait on other node's handler */
+ o2net_set_nst_status_time(&nst);
wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
+ o2net_update_send_stats(&nst, sc);
+
/* Note that we avoid overwriting the callers status return
* variable if a system error was reported on the other
* side. Callers beware. */
@@ -957,12 +1133,11 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
mlog(0, "woken, returning system status %d, user status %d\n",
ret, nsw.ns_status);
out:
+ o2net_debug_del_nst(&nst); /* must be before dropping sc and node */
if (sc)
sc_put(sc);
- if (vec)
- kfree(vec);
- if (msg)
- kfree(msg);
+ kfree(vec);
+ kfree(msg);
o2net_complete_nsw(nn, &nsw, 0, 0, 0);
return ret;
}
@@ -1011,6 +1186,7 @@ static int o2net_process_message(struct o2net_sock_container *sc,
int ret = 0, handler_status;
enum o2net_system_error syserr;
struct o2net_msg_handler *nmh = NULL;
+ void *ret_data = NULL;
msglog(hdr, "processing message\n");
@@ -1058,22 +1234,33 @@ static int o2net_process_message(struct o2net_sock_container *sc,
if (syserr != O2NET_ERR_NONE)
goto out_respond;
- do_gettimeofday(&sc->sc_tv_func_start);
+ o2net_set_func_start_time(sc);
sc->sc_msg_key = be32_to_cpu(hdr->key);
sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
be16_to_cpu(hdr->data_len),
- nmh->nh_func_data);
- do_gettimeofday(&sc->sc_tv_func_stop);
+ nmh->nh_func_data, &ret_data);
+ o2net_set_func_stop_time(sc);
+
+ o2net_update_recv_stats(sc);
out_respond:
/* this destroys the hdr, so don't use it after this */
+ mutex_lock(&sc->sc_send_lock);
ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr,
handler_status);
+ mutex_unlock(&sc->sc_send_lock);
hdr = NULL;
mlog(0, "sending handler status %d, syserr %d returned %d\n",
handler_status, syserr, ret);
+ if (nmh) {
+ BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL);
+ if (nmh->nh_post_func)
+ (nmh->nh_post_func)(handler_status, nmh->nh_func_data,
+ ret_data);
+ }
+
out:
if (nmh)
o2net_handler_put(nmh);
@@ -1086,24 +1273,63 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
- mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
- "version %llu but %llu is required, disconnecting\n",
- SC_NODEF_ARGS(sc),
- (unsigned long long)be64_to_cpu(hand->protocol_version),
- O2NET_PROTOCOL_VERSION);
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net "
+ "protocol version %llu but %llu is required. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ (unsigned long long)be64_to_cpu(hand->protocol_version),
+ O2NET_PROTOCOL_VERSION);
/* don't bother reconnecting if its the wrong version. */
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
+ /*
+ * Ensure timeouts are consistent with other nodes, otherwise
+ * we can end up with one node thinking that the other must be down,
+ * but isn't. This can ultimately cause corruption.
+ */
+ if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
+ o2net_idle_timeout()) {
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network "
+ "idle timeout of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2net_idle_timeout_ms),
+ o2net_idle_timeout());
+ o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+ return -1;
+ }
+
+ if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
+ o2net_keepalive_delay()) {
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive "
+ "delay of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2net_keepalive_delay_ms),
+ o2net_keepalive_delay());
+ o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+ return -1;
+ }
+
+ if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
+ O2HB_MAX_WRITE_TIMEOUT_MS) {
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat "
+ "timeout of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
+ O2HB_MAX_WRITE_TIMEOUT_MS);
+ o2net_ensure_shutdown(nn, sc, -ENOTCONN);
+ return -1;
+ }
+
sc->sc_handshake_ok = 1;
spin_lock(&nn->nn_lock);
/* set valid and queue the idle timers only if it hasn't been
* shut down already */
if (nn->nn_sc == sc) {
- o2net_sc_postpone_idle(sc);
+ o2net_sc_reset_idle_timer(sc);
+ atomic_set(&nn->nn_timeout, 0);
o2net_set_nn_state(nn, sc, 1, 0);
}
spin_unlock(&nn->nn_lock);
@@ -1127,7 +1353,24 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
size_t datalen;
sclog(sc, "receiving\n");
- do_gettimeofday(&sc->sc_tv_advance_start);
+ o2net_set_advance_start_time(sc);
+
+ if (unlikely(sc->sc_handshake_ok == 0)) {
+ if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
+ data = page_address(sc->sc_page) + sc->sc_page_off;
+ datalen = sizeof(struct o2net_handshake) - sc->sc_page_off;
+ ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
+ if (ret > 0)
+ sc->sc_page_off += ret;
+ }
+
+ if (sc->sc_page_off == sizeof(struct o2net_handshake)) {
+ o2net_check_handshake(sc);
+ if (unlikely(sc->sc_handshake_ok == 0))
+ ret = -EPROTO;
+ }
+ goto out;
+ }
/* do we need more header? */
if (sc->sc_page_off < sizeof(struct o2net_msg)) {
@@ -1136,15 +1379,6 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
if (ret > 0) {
sc->sc_page_off += ret;
-
- /* this working relies on the handshake being
- * smaller than the normal message header */
- if (sc->sc_page_off >= sizeof(struct o2net_handshake)&&
- !sc->sc_handshake_ok && o2net_check_handshake(sc)) {
- ret = -EPROTO;
- goto out;
- }
-
/* only swab incoming here.. we can
* only get here once as we cross from
* being under to over */
@@ -1194,16 +1428,17 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
out:
sclog(sc, "ret = %d\n", ret);
- do_gettimeofday(&sc->sc_tv_advance_stop);
+ o2net_set_advance_stop_time(sc);
return ret;
}
/* this work func is triggerd by data ready. it reads until it can read no
* more. it interprets 0, eof, as fatal. if data_ready hits while we're doing
* our work the work struct will be marked and we'll be called again. */
-static void o2net_rx_until_empty(void *arg)
+static void o2net_rx_until_empty(struct work_struct *work)
{
- struct o2net_sock_container *sc = arg;
+ struct o2net_sock_container *sc =
+ container_of(work, struct o2net_sock_container, sc_rx_work);
int ret;
do {
@@ -1245,26 +1480,42 @@ static int o2net_set_nodelay(struct socket *sock)
return ret;
}
+static void o2net_initialize_handshake(void)
+{
+ o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
+ O2HB_MAX_WRITE_TIMEOUT_MS);
+ o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout());
+ o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32(
+ o2net_keepalive_delay());
+ o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32(
+ o2net_reconnect_delay());
+}
+
/* ------------------------------------------------------------ */
/* called when a connect completes and after a sock is accepted. the
* rx path will see the response and mark the sc valid */
-static void o2net_sc_connect_completed(void *arg)
+static void o2net_sc_connect_completed(struct work_struct *work)
{
- struct o2net_sock_container *sc = arg;
+ struct o2net_sock_container *sc =
+ container_of(work, struct o2net_sock_container,
+ sc_connect_work);
mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n",
(unsigned long long)O2NET_PROTOCOL_VERSION,
(unsigned long long)be64_to_cpu(o2net_hand->connector_id));
+ o2net_initialize_handshake();
o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
sc_put(sc);
}
/* this is called as a work_struct func. */
-static void o2net_sc_send_keep_req(void *arg)
+static void o2net_sc_send_keep_req(struct work_struct *work)
{
- struct o2net_sock_container *sc = arg;
+ struct o2net_sock_container *sc =
+ container_of(work, struct o2net_sock_container,
+ sc_keepalive_work.work);
o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req));
sc_put(sc);
@@ -1276,37 +1527,42 @@ static void o2net_sc_send_keep_req(void *arg)
static void o2net_idle_timer(unsigned long data)
{
struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
- struct timeval now;
-
- do_gettimeofday(&now);
-
- printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for 10 "
- "seconds, shutting it down.\n", SC_NODEF_ARGS(sc));
- mlog(ML_NOTICE, "here are some times that might help debug the "
- "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
- "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
- sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
- now.tv_sec, (long) now.tv_usec,
- sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
- sc->sc_tv_advance_start.tv_sec,
- (long) sc->sc_tv_advance_start.tv_usec,
- sc->sc_tv_advance_stop.tv_sec,
- (long) sc->sc_tv_advance_stop.tv_usec,
- sc->sc_msg_key, sc->sc_msg_type,
- sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
- sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
+ struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+#ifdef CONFIG_DEBUG_FS
+ unsigned long msecs = ktime_to_ms(ktime_get()) -
+ ktime_to_ms(sc->sc_tv_timer);
+#else
+ unsigned long msecs = o2net_idle_timeout();
+#endif
+
+ printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
+ "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
+ msecs / 1000, msecs % 1000);
+
+ /*
+ * Initialize the nn_timeout so that the next connection attempt
+ * will continue in o2net_start_connect.
+ */
+ atomic_set(&nn->nn_timeout, 1);
o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
}
-static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
+static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
{
o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
- O2NET_KEEPALIVE_DELAY_SECS * HZ);
- do_gettimeofday(&sc->sc_tv_timer);
+ msecs_to_jiffies(o2net_keepalive_delay()));
+ o2net_set_sock_timer(sc);
mod_timer(&sc->sc_idle_timeout,
- jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ));
+ jiffies + msecs_to_jiffies(o2net_idle_timeout()));
+}
+
+static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
+{
+ /* Only push out an existing timer */
+ if (timer_pending(&sc->sc_idle_timeout))
+ o2net_sc_reset_idle_timer(sc);
}
/* this work func is kicked whenever a path sets the nn state which doesn't
@@ -1314,14 +1570,16 @@ static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
* having a connect attempt fail, etc. This centralizes the logic which decides
* if a connect attempt should be made or if we should give up and all future
* transmit attempts should fail */
-static void o2net_start_connect(void *arg)
+static void o2net_start_connect(struct work_struct *work)
{
- struct o2net_node *nn = arg;
+ struct o2net_node *nn =
+ container_of(work, struct o2net_node, nn_connect_work.work);
struct o2net_sock_container *sc = NULL;
struct o2nm_node *node = NULL, *mynode = NULL;
struct socket *sock = NULL;
struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
- int ret = 0;
+ int ret = 0, stop;
+ unsigned int timeout;
/* if we're greater we initiate tx, otherwise we accept */
if (o2nm_this_node() <= o2net_num_from_nn(nn))
@@ -1341,11 +1599,19 @@ static void o2net_start_connect(void *arg)
}
spin_lock(&nn->nn_lock);
- /* see if we already have one pending or have given up */
- if (nn->nn_sc || nn->nn_persistent_error)
- arg = NULL;
+ /*
+ * see if we already have one pending or have given up.
+ * For nn_timeout, it is set when we close the connection
+ * because of the idle time out. So it means that we have
+ * at least connected to that node successfully once,
+ * now try to connect to it again.
+ */
+ timeout = atomic_read(&nn->nn_timeout);
+ stop = (nn->nn_sc ||
+ (nn->nn_persistent_error &&
+ (nn->nn_persistent_error != -ENOTCONN || timeout == 0)));
spin_unlock(&nn->nn_lock);
- if (arg == NULL) /* *shrug*, needed some indicator */
+ if (stop)
goto out;
nn->nn_last_connect_attempt = jiffies;
@@ -1367,14 +1633,14 @@ static void o2net_start_connect(void *arg)
sock->sk->sk_allocation = GFP_ATOMIC;
myaddr.sin_family = AF_INET;
- myaddr.sin_addr.s_addr = (__force u32)mynode->nd_ipv4_address;
- myaddr.sin_port = (__force u16)htons(0); /* any port */
+ myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
+ myaddr.sin_port = htons(0); /* any port */
ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
sizeof(myaddr));
if (ret) {
- mlog(ML_ERROR, "bind failed with %d at address %u.%u.%u.%u\n",
- ret, NIPQUAD(mynode->nd_ipv4_address));
+ mlog(ML_ERROR, "bind failed with %d at address %pI4\n",
+ ret, &mynode->nd_ipv4_address);
goto out;
}
@@ -1392,8 +1658,8 @@ static void o2net_start_connect(void *arg)
spin_unlock(&nn->nn_lock);
remoteaddr.sin_family = AF_INET;
- remoteaddr.sin_addr.s_addr = (__force u32)node->nd_ipv4_address;
- remoteaddr.sin_port = (__force u16)node->nd_ipv4_port;
+ remoteaddr.sin_addr.s_addr = node->nd_ipv4_address;
+ remoteaddr.sin_port = node->nd_ipv4_port;
ret = sc->sc_sock->ops->connect(sc->sc_sock,
(struct sockaddr *)&remoteaddr,
@@ -1403,13 +1669,12 @@ static void o2net_start_connect(void *arg)
ret = 0;
out:
- if (ret) {
- mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed "
- "with errno %d\n", SC_NODEF_ARGS(sc), ret);
+ if (ret && sc) {
+ printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
+ " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
/* 0 err so that another will be queued and attempted
* from set_nn_state */
- if (sc)
- o2net_ensure_shutdown(nn, sc, 0);
+ o2net_ensure_shutdown(nn, sc, 0);
}
if (sc)
sc_put(sc);
@@ -1421,24 +1686,28 @@ out:
return;
}
-static void o2net_connect_expired(void *arg)
+static void o2net_connect_expired(struct work_struct *work)
{
- struct o2net_node *nn = arg;
+ struct o2net_node *nn =
+ container_of(work, struct o2net_node, nn_connect_expired.work);
spin_lock(&nn->nn_lock);
if (!nn->nn_sc_valid) {
- mlog(ML_ERROR, "no connection established with node %u after "
- "%u seconds, giving up and returning errors.\n",
- o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS);
+ printk(KERN_NOTICE "o2net: No connection established with "
+ "node %u after %u.%u seconds, giving up.\n",
+ o2net_num_from_nn(nn),
+ o2net_idle_timeout() / 1000,
+ o2net_idle_timeout() % 1000);
o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
}
spin_unlock(&nn->nn_lock);
}
-static void o2net_still_up(void *arg)
+static void o2net_still_up(struct work_struct *work)
{
- struct o2net_node *nn = arg;
+ struct o2net_node *nn =
+ container_of(work, struct o2net_node, nn_still_up.work);
o2quo_hb_still_up(o2net_num_from_nn(nn));
}
@@ -1451,6 +1720,7 @@ void o2net_disconnect_node(struct o2nm_node *node)
/* don't reconnect until it's heartbeating again */
spin_lock(&nn->nn_lock);
+ atomic_set(&nn->nn_timeout, 0);
o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
spin_unlock(&nn->nn_lock);
@@ -1467,8 +1737,13 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
{
o2quo_hb_down(node_num);
+ if (!node)
+ return;
+
if (node_num != o2nm_this_node())
o2net_disconnect_node(node);
+
+ BUG_ON(atomic_read(&o2net_connected_peers) < 0);
}
static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
@@ -1478,22 +1753,19 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
o2quo_hb_up(node_num);
+ BUG_ON(!node);
+
/* ensure an immediate connect attempt */
nn->nn_last_connect_attempt = jiffies -
- (msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1);
+ (msecs_to_jiffies(o2net_reconnect_delay()) + 1);
if (node_num != o2nm_this_node()) {
- /* heartbeat doesn't work unless a local node number is
- * configured and doing so brings up the o2net_wq, so we can
- * use it.. */
- queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
- O2NET_IDLE_TIMEOUT_SECS * HZ);
-
/* believe it or not, accept and node hearbeating testing
* can succeed for this node before we got here.. so
* only use set_nn_state to clear the persistent error
* if that hasn't already happened */
spin_lock(&nn->nn_lock);
+ atomic_set(&nn->nn_timeout, 0);
if (nn->nn_persistent_error)
o2net_set_nn_state(nn, NULL, 0, 0);
spin_unlock(&nn->nn_lock);
@@ -1502,17 +1774,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
void o2net_unregister_hb_callbacks(void)
{
- int ret;
-
- ret = o2hb_unregister_callback(&o2net_hb_up);
- if (ret < 0)
- mlog(ML_ERROR, "Status return %d unregistering heartbeat up "
- "callback!\n", ret);
-
- ret = o2hb_unregister_callback(&o2net_hb_down);
- if (ret < 0)
- mlog(ML_ERROR, "Status return %d unregistering heartbeat down "
- "callback!\n", ret);
+ o2hb_unregister_callback(NULL, &o2net_hb_up);
+ o2hb_unregister_callback(NULL, &o2net_hb_down);
}
int o2net_register_hb_callbacks(void)
@@ -1524,9 +1787,9 @@ int o2net_register_hb_callbacks(void)
o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
- ret = o2hb_register_callback(&o2net_hb_up);
+ ret = o2hb_register_callback(NULL, &o2net_hb_up);
if (ret == 0)
- ret = o2hb_register_callback(&o2net_hb_down);
+ ret = o2hb_register_callback(NULL, &o2net_hb_down);
if (ret)
o2net_unregister_hb_callbacks();
@@ -1536,16 +1799,18 @@ int o2net_register_hb_callbacks(void)
/* ------------------------------------------------------------ */
-static int o2net_accept_one(struct socket *sock)
+static int o2net_accept_one(struct socket *sock, int *more)
{
int ret, slen;
struct sockaddr_in sin;
struct socket *new_sock = NULL;
struct o2nm_node *node = NULL;
+ struct o2nm_node *local_node = NULL;
struct o2net_sock_container *sc = NULL;
struct o2net_node *nn;
BUG_ON(sock == NULL);
+ *more = 0;
ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
sock->sk->sk_protocol, &new_sock);
if (ret)
@@ -1557,6 +1822,7 @@ static int o2net_accept_one(struct socket *sock)
if (ret < 0)
goto out;
+ *more = 1;
new_sock->sk->sk_allocation = GFP_ATOMIC;
ret = o2net_set_nodelay(new_sock);
@@ -1571,20 +1837,27 @@ static int o2net_accept_one(struct socket *sock)
if (ret < 0)
goto out;
- node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr);
+ node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
if (node == NULL) {
- mlog(ML_NOTICE, "attempt to connect from unknown node at "
- "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr),
- ntohs((__force __be16)sin.sin_port));
+ printk(KERN_NOTICE "o2net: Attempt to connect from unknown "
+ "node at %pI4:%d\n", &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
- if (o2nm_this_node() > node->nd_num) {
- mlog(ML_NOTICE, "unexpected connect attempted from a lower "
- "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n",
- node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
- ntohs((__force __be16)sin.sin_port), node->nd_num);
+ if (o2nm_this_node() >= node->nd_num) {
+ local_node = o2nm_get_node_by_num(o2nm_this_node());
+ if (local_node)
+ printk(KERN_NOTICE "o2net: Unexpected connect attempt "
+ "seen at node '%s' (%u, %pI4:%d) from "
+ "node '%s' (%u, %pI4:%d)\n",
+ local_node->nd_name, local_node->nd_num,
+ &(local_node->nd_ipv4_address),
+ ntohs(local_node->nd_ipv4_port),
+ node->nd_name,
+ node->nd_num, &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
@@ -1593,9 +1866,9 @@ static int o2net_accept_one(struct socket *sock)
* and tries to connect before we see their heartbeat */
if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) {
mlog(ML_CONN, "attempt to connect from node '%s' at "
- "%u.%u.%u.%u:%d but it isn't heartbeating\n",
- node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
- ntohs((__force __be16)sin.sin_port));
+ "%pI4:%d but it isn't heartbeating\n",
+ node->nd_name, &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
@@ -1609,10 +1882,10 @@ static int o2net_accept_one(struct socket *sock)
ret = 0;
spin_unlock(&nn->nn_lock);
if (ret) {
- mlog(ML_NOTICE, "attempt to connect from node '%s' at "
- "%u.%u.%u.%u:%d but it already has an open connection\n",
- node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
- ntohs((__force __be16)sin.sin_port));
+ printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' "
+ "at %pI4:%d but it already has an open connection\n",
+ node->nd_name, &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
goto out;
}
@@ -1626,12 +1899,14 @@ static int o2net_accept_one(struct socket *sock)
new_sock = NULL;
spin_lock(&nn->nn_lock);
+ atomic_set(&nn->nn_timeout, 0);
o2net_set_nn_state(nn, sc, 0, 0);
spin_unlock(&nn->nn_lock);
o2net_register_callbacks(sc->sc_sock->sk, sc);
o2net_sc_queue_work(sc, &sc->sc_rx_work);
+ o2net_initialize_handshake();
o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
out:
@@ -1639,21 +1914,48 @@ out:
sock_release(new_sock);
if (node)
o2nm_node_put(node);
+ if (local_node)
+ o2nm_node_put(local_node);
if (sc)
sc_put(sc);
return ret;
}
-static void o2net_accept_many(void *arg)
+/*
+ * This function is invoked in response to one or more
+ * pending accepts at softIRQ level. We must drain the
+ * entire que before returning.
+ */
+
+static void o2net_accept_many(struct work_struct *work)
{
- struct socket *sock = arg;
- while (o2net_accept_one(sock) == 0)
+ struct socket *sock = o2net_listen_sock;
+ int more;
+ int err;
+
+ /*
+ * It is critical to note that due to interrupt moderation
+ * at the network driver level, we can't assume to get a
+ * softIRQ for every single conn since tcp SYN packets
+ * can arrive back-to-back, and therefore many pending
+ * accepts may result in just 1 softIRQ. If we terminate
+ * the o2net_accept_one() loop upon seeing an err, what happens
+ * to the rest of the conns in the queue? If no new SYN
+ * arrives for hours, no softIRQ will be delivered,
+ * and the connections will just sit in the queue.
+ */
+
+ for (;;) {
+ err = o2net_accept_one(sock, &more);
+ if (!more)
+ break;
cond_resched();
+ }
}
-static void o2net_listen_data_ready(struct sock *sk, int bytes)
+static void o2net_listen_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
read_lock(&sk->sk_callback_lock);
ready = sk->sk_user_data;
@@ -1662,33 +1964,44 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes)
goto out;
}
- /* ->sk_data_ready is also called for a newly established child socket
- * before it has been accepted and the acceptor has set up their
- * data_ready.. we only want to queue listen work for our listening
- * socket */
+ /* This callback may called twice when a new connection
+ * is being established as a child socket inherits everything
+ * from a parent LISTEN socket, including the data_ready cb of
+ * the parent. This leads to a hazard. In o2net_accept_one()
+ * we are still initializing the child socket but have not
+ * changed the inherited data_ready callback yet when
+ * data starts arriving.
+ * We avoid this hazard by checking the state.
+ * For the listening socket, the state will be TCP_LISTEN; for the new
+ * socket, will be TCP_ESTABLISHED. Also, in this case,
+ * sk->sk_user_data is not a valid function pointer.
+ */
+
if (sk->sk_state == TCP_LISTEN) {
- mlog(ML_TCP, "bytes: %d\n", bytes);
queue_work(o2net_wq, &o2net_listen_work);
+ } else {
+ ready = NULL;
}
out:
read_unlock(&sk->sk_callback_lock);
- ready(sk, bytes);
+ if (ready != NULL)
+ ready(sk);
}
-static int o2net_open_listening_sock(__be16 port)
+static int o2net_open_listening_sock(__be32 addr, __be16 port)
{
struct socket *sock = NULL;
int ret;
struct sockaddr_in sin = {
.sin_family = PF_INET,
- .sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) },
- .sin_port = (__force u16)port,
+ .sin_addr = { .s_addr = addr },
+ .sin_port = port,
};
ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0) {
- mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret);
+ printk(KERN_ERR "o2net: Error %d while creating socket\n", ret);
goto out;
}
@@ -1700,21 +2013,20 @@ static int o2net_open_listening_sock(__be16 port)
write_unlock_bh(&sock->sk->sk_callback_lock);
o2net_listen_sock = sock;
- INIT_WORK(&o2net_listen_work, o2net_accept_many, sock);
+ INIT_WORK(&o2net_listen_work, o2net_accept_many);
- sock->sk->sk_reuse = 1;
+ sock->sk->sk_reuse = SK_CAN_REUSE;
ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
if (ret < 0) {
- mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n",
- ntohs(port), ret);
+ printk(KERN_ERR "o2net: Error %d while binding socket at "
+ "%pI4:%u\n", ret, &addr, ntohs(port));
goto out;
}
ret = sock->ops->listen(sock, 64);
- if (ret < 0) {
- mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n",
- ntohs(port), ret);
- }
+ if (ret < 0)
+ printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n",
+ ret, &addr, ntohs(port));
out:
if (ret) {
@@ -1746,7 +2058,8 @@ int o2net_start_listening(struct o2nm_node *node)
return -ENOMEM; /* ? */
}
- ret = o2net_open_listening_sock(node->nd_ipv4_port);
+ ret = o2net_open_listening_sock(node->nd_ipv4_address,
+ node->nd_ipv4_port);
if (ret) {
destroy_workqueue(o2net_wq);
o2net_wq = NULL;
@@ -1799,9 +2112,12 @@ int o2net_init(void)
o2quo_init();
- o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL);
- o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
- o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
+ if (o2net_debugfs_init())
+ return -ENOMEM;
+
+ o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
+ o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
+ o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
kfree(o2net_hand);
kfree(o2net_keep_req);
@@ -1818,10 +2134,12 @@ int o2net_init(void)
for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
struct o2net_node *nn = o2net_nn_from_num(i);
+ atomic_set(&nn->nn_timeout, 0);
spin_lock_init(&nn->nn_lock);
- INIT_WORK(&nn->nn_connect_work, o2net_start_connect, nn);
- INIT_WORK(&nn->nn_connect_expired, o2net_connect_expired, nn);
- INIT_WORK(&nn->nn_still_up, o2net_still_up, nn);
+ INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect);
+ INIT_DELAYED_WORK(&nn->nn_connect_expired,
+ o2net_connect_expired);
+ INIT_DELAYED_WORK(&nn->nn_still_up, o2net_still_up);
/* until we see hb from a node we'll return einval */
nn->nn_persistent_error = -ENOTCONN;
init_waitqueue_head(&nn->nn_sc_wq);
@@ -1838,4 +2156,5 @@ void o2net_exit(void)
kfree(o2net_hand);
kfree(o2net_keep_req);
kfree(o2net_keep_resp);
+ o2net_debugfs_exit();
}
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index 616ff2b8434..5bada2a69b5 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -50,10 +50,20 @@ struct o2net_msg
__u8 buf[0];
};
-typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data);
+typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+typedef void (o2net_post_msg_handler_func)(int status, void *data,
+ void *ret_data);
#define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg))
+/* same as hb delay, we're waiting for another node to recognize our hb */
+#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000
+
+#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
+#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
+
+
/* TODO: figure this out.... */
static inline int o2net_link_down(int err, struct socket *sock)
{
@@ -92,17 +102,53 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
o2net_msg_handler_func *func, void *data,
+ o2net_post_msg_handler_func *post_func,
struct list_head *unreg_list);
void o2net_unregister_handler_list(struct list_head *list);
+void o2net_fill_node_map(unsigned long *map, unsigned bytes);
+
struct o2nm_node;
int o2net_register_hb_callbacks(void);
void o2net_unregister_hb_callbacks(void);
int o2net_start_listening(struct o2nm_node *node);
void o2net_stop_listening(struct o2nm_node *node);
void o2net_disconnect_node(struct o2nm_node *node);
+int o2net_num_connected_peers(void);
int o2net_init(void);
void o2net_exit(void);
+struct o2net_send_tracking;
+struct o2net_sock_container;
+
+#ifdef CONFIG_DEBUG_FS
+int o2net_debugfs_init(void);
+void o2net_debugfs_exit(void);
+void o2net_debug_add_nst(struct o2net_send_tracking *nst);
+void o2net_debug_del_nst(struct o2net_send_tracking *nst);
+void o2net_debug_add_sc(struct o2net_sock_container *sc);
+void o2net_debug_del_sc(struct o2net_sock_container *sc);
+#else
+static inline int o2net_debugfs_init(void)
+{
+ return 0;
+}
+static inline void o2net_debugfs_exit(void)
+{
+}
+static inline void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_debug_add_sc(struct o2net_sock_container *sc)
+{
+}
+static inline void o2net_debug_del_sc(struct o2net_sock_container *sc)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
#endif /* O2CLUSTER_TCP_H */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4b46aac7d24..dc024367110 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -27,23 +27,44 @@
#define O2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57)
#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
-/* same as hb delay, we're waiting for another node to recognize our hb */
-#define O2NET_RECONNECT_DELAY_MS O2HB_REGION_TIMEOUT_MS
-
/* we're delaying our quorum decision so that heartbeat will have timed
* out truly dead nodes by the time we come around to making decisions
* on their number */
#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
-#define O2NET_KEEPALIVE_DELAY_SECS 5
-#define O2NET_IDLE_TIMEOUT_SECS 10
-
-/*
+/*
* This version number represents quite a lot, unfortunately. It not
* only represents the raw network message protocol on the wire but also
- * locking semantics of the file system using the protocol. It should
+ * locking semantics of the file system using the protocol. It should
* be somewhere else, I'm sure, but right now it isn't.
*
+ * With version 11, we separate out the filesystem locking portion. The
+ * filesystem now has a major.minor version it negotiates. Version 11
+ * introduces this negotiation to the o2dlm protocol, and as such the
+ * version here in tcp_internal.h should not need to be bumped for
+ * filesystem locking changes.
+ *
+ * New in version 11
+ * - Negotiation of filesystem locking in the dlm join.
+ *
+ * New in version 10:
+ * - Meta/data locks combined
+ *
+ * New in version 9:
+ * - All votes removed
+ *
+ * New in version 8:
+ * - Replace delete inode votes with a cluster lock
+ *
+ * New in version 7:
+ * - DLM join domain includes the live nodemap
+ *
+ * New in version 6:
+ * - DLM lockres remote refcount fixes.
+ *
+ * New in version 5:
+ * - Network timeout checking protocol
+ *
* New in version 4:
* - Remove i_generation from lock names for better stat performance.
*
@@ -54,10 +75,14 @@
* - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down
*/
-#define O2NET_PROTOCOL_VERSION 4ULL
+#define O2NET_PROTOCOL_VERSION 11ULL
struct o2net_handshake {
__be64 protocol_version;
__be64 connector_id;
+ __be32 o2hb_heartbeat_timeout_ms;
+ __be32 o2net_idle_timeout_ms;
+ __be32 o2net_keepalive_delay_ms;
+ __be32 o2net_reconnect_delay_ms;
};
struct o2net_node {
@@ -70,6 +95,8 @@ struct o2net_node {
unsigned nn_sc_valid:1;
/* if this is set tx just returns it */
int nn_persistent_error;
+ /* It is only set to 1 after the idle time out. */
+ atomic_t nn_timeout;
/* threads waiting for an sc to arrive wait on the wq for generation
* to increase. it is increased when a connecting socket succeeds
@@ -86,23 +113,23 @@ struct o2net_node {
* connect attempt fails and so can be self-arming. shutdown is
* careful to first mark the nn such that no connects will be attempted
* before canceling delayed connect work and flushing the queue. */
- struct work_struct nn_connect_work;
+ struct delayed_work nn_connect_work;
unsigned long nn_last_connect_attempt;
/* this is queued as nodes come up and is canceled when a connection is
* established. this expiring gives up on the node and errors out
* transmits */
- struct work_struct nn_connect_expired;
+ struct delayed_work nn_connect_expired;
/* after we give up on a socket we wait a while before deciding
* that it is still heartbeating and that we should do some
* quorum work */
- struct work_struct nn_still_up;
+ struct delayed_work nn_still_up;
};
struct o2net_sock_container {
struct kref sc_kref;
- /* the next two are vaild for the life time of the sc */
+ /* the next two are valid for the life time of the sc */
struct socket *sc_sock;
struct o2nm_node *sc_node;
@@ -129,7 +156,7 @@ struct o2net_sock_container {
struct work_struct sc_shutdown_work;
struct timer_list sc_idle_timeout;
- struct work_struct sc_keepalive_work;
+ struct delayed_work sc_keepalive_work;
unsigned sc_handshake_ok:1;
@@ -138,16 +165,29 @@ struct o2net_sock_container {
/* original handlers for the sockets */
void (*sc_state_change)(struct sock *sk);
- void (*sc_data_ready)(struct sock *sk, int bytes);
-
- struct timeval sc_tv_timer;
- struct timeval sc_tv_data_ready;
- struct timeval sc_tv_advance_start;
- struct timeval sc_tv_advance_stop;
- struct timeval sc_tv_func_start;
- struct timeval sc_tv_func_stop;
+ void (*sc_data_ready)(struct sock *sk);
+
u32 sc_msg_key;
u16 sc_msg_type;
+
+#ifdef CONFIG_DEBUG_FS
+ struct list_head sc_net_debug_item;
+ ktime_t sc_tv_timer;
+ ktime_t sc_tv_data_ready;
+ ktime_t sc_tv_advance_start;
+ ktime_t sc_tv_advance_stop;
+ ktime_t sc_tv_func_start;
+ ktime_t sc_tv_func_stop;
+#endif
+#ifdef CONFIG_OCFS2_FS_STATS
+ ktime_t sc_tv_acquiry_total;
+ ktime_t sc_tv_send_total;
+ ktime_t sc_tv_status_total;
+ u32 sc_send_count;
+ u32 sc_recv_count;
+ ktime_t sc_tv_process_total;
+#endif
+ struct mutex sc_send_lock;
};
struct o2net_msg_handler {
@@ -157,6 +197,8 @@ struct o2net_msg_handler {
u32 nh_key;
o2net_msg_handler_func *nh_func;
o2net_msg_handler_func *nh_func_data;
+ o2net_post_msg_handler_func
+ *nh_post_func;
struct kref nh_kref;
struct list_head nh_unregister_item;
};
@@ -177,4 +219,24 @@ struct o2net_status_wait {
struct list_head ns_node_item;
};
+#ifdef CONFIG_DEBUG_FS
+/* just for state dumps */
+struct o2net_send_tracking {
+ struct list_head st_net_debug_item;
+ struct task_struct *st_task;
+ struct o2net_sock_container *st_sc;
+ u32 st_id;
+ u32 st_msg_type;
+ u32 st_msg_key;
+ u8 st_node;
+ ktime_t st_sock_time;
+ ktime_t st_send_time;
+ ktime_t st_status_time;
+};
+#else
+struct o2net_send_tracking {
+ u32 dummy;
+};
+#endif /* CONFIG_DEBUG_FS */
+
#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
deleted file mode 100644
index 7286c48bb30..00000000000
--- a/fs/ocfs2/cluster/ver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define CLUSTER_BUILD_VERSION "1.3.3"
-
-#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
-
-void cluster_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
deleted file mode 100644
index 32554c3382c..00000000000
--- a/fs/ocfs2/cluster/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef O2CLUSTER_VER_H
-#define O2CLUSTER_VER_H
-
-void cluster_print_version(void);
-
-#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 014e73978da..e2e05a106be 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -28,7 +28,6 @@
#include <linux/slab.h>
#include <linux/namei.h>
-#define MLOG_MASK_PREFIX ML_DCACHE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -38,23 +37,48 @@
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
+#include "ocfs2_trace.h"
+
+void ocfs2_dentry_attach_gen(struct dentry *dentry)
+{
+ unsigned long gen =
+ OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+ BUG_ON(dentry->d_inode);
+ dentry->d_fsdata = (void *)gen;
+}
-static int ocfs2_dentry_revalidate(struct dentry *dentry,
- struct nameidata *nd)
+static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode;
int ret = 0; /* if all else fails, just return false */
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+ struct ocfs2_super *osb;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
- mlog_entry("(0x%p, '%.*s')\n", dentry,
- dentry->d_name.len, dentry->d_name.name);
+ inode = dentry->d_inode;
+ osb = OCFS2_SB(dentry->d_sb);
- /* Never trust a negative dentry - force a new lookup. */
+ trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len,
+ dentry->d_name.name);
+
+ /* For a negative dentry -
+ * check the generation number of the parent and compare with the
+ * one stored in the inode.
+ */
if (inode == NULL) {
- mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
- dentry->d_name.name);
- goto bail;
+ unsigned long gen = (unsigned long) dentry->d_fsdata;
+ unsigned long pgen;
+ spin_lock(&dentry->d_lock);
+ pgen = OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+ spin_unlock(&dentry->d_lock);
+ trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
+ dentry->d_name.name,
+ pgen, gen);
+ if (gen != pgen)
+ goto bail;
+ goto valid;
}
BUG_ON(!osb);
@@ -66,8 +90,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
/* did we or someone else delete this inode? */
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
- mlog(0, "inode (%llu) deleted, returning false\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_dentry_revalidate_delete(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
goto bail;
}
spin_unlock(&OCFS2_I(inode)->ip_lock);
@@ -77,18 +101,27 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
* inode nlink hits zero, it never goes back.
*/
if (inode->i_nlink == 0) {
- mlog(0, "Inode %llu orphaned, returning false "
- "dir = %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- S_ISDIR(inode->i_mode));
+ trace_ocfs2_dentry_revalidate_orphaned(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ S_ISDIR(inode->i_mode));
goto bail;
}
+ /*
+ * If the last lookup failed to create dentry lock, let us
+ * redo it.
+ */
+ if (!dentry->d_fsdata) {
+ trace_ocfs2_dentry_revalidate_nofsdata(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ goto bail;
+ }
+
+valid:
ret = 1;
bail:
- mlog_exit(ret);
-
+ trace_ocfs2_dentry_revalidate_ret(ret);
return ret;
}
@@ -128,36 +161,32 @@ static int ocfs2_match_dentry(struct dentry *dentry,
/*
* Walk the inode alias list, and find a dentry which has a given
* parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
- * is looking for a dentry_lock reference. The vote thread is looking
- * to unhash aliases, so we allow it to skip any that already have
- * that property.
+ * is looking for a dentry_lock reference. The downconvert thread is
+ * looking to unhash aliases, so we allow it to skip any that already
+ * have that property.
*/
struct dentry *ocfs2_find_local_alias(struct inode *inode,
u64 parent_blkno,
int skip_unhashed)
{
- struct list_head *p;
- struct dentry *dentry = NULL;
-
- spin_lock(&dcache_lock);
-
- list_for_each(p, &inode->i_dentry) {
- dentry = list_entry(p, struct dentry, d_alias);
+ struct dentry *dentry;
+ spin_lock(&inode->i_lock);
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+ spin_lock(&dentry->d_lock);
if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
- mlog(0, "dentry found: %.*s\n",
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_find_local_alias(dentry->d_name.len,
+ dentry->d_name.name);
- dget_locked(dentry);
- break;
+ dget_dlock(dentry);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&inode->i_lock);
+ return dentry;
}
-
- dentry = NULL;
+ spin_unlock(&dentry->d_lock);
}
-
- spin_unlock(&dcache_lock);
-
- return dentry;
+ spin_unlock(&inode->i_lock);
+ return NULL;
}
DEFINE_SPINLOCK(dentry_attach_lock);
@@ -202,9 +231,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
struct dentry *alias;
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
- mlog(0, "Attach \"%.*s\", parent %llu, fsdata: %p\n",
- dentry->d_name.len, dentry->d_name.name,
- (unsigned long long)parent_blkno, dl);
+ trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name,
+ (unsigned long long)parent_blkno, dl);
/*
* Negative dentry. We ignore these for now.
@@ -215,6 +243,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
if (!inode)
return 0;
+ if (!dentry->d_inode && dentry->d_fsdata) {
+ /* Converting a negative dentry to positive
+ Clear dentry->d_fsdata */
+ dentry->d_fsdata = dl = NULL;
+ }
+
if (dl) {
mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
" \"%.*s\": old parent: %llu, new: %llu\n",
@@ -248,7 +282,9 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
(unsigned long long)parent_blkno,
(unsigned long long)dl->dl_parent_blkno);
- mlog(0, "Found: %s\n", dl->dl_lockres.l_name);
+ trace_ocfs2_dentry_attach_lock_found(dl->dl_lockres.l_name,
+ (unsigned long long)parent_blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
goto out_attach;
}
@@ -266,7 +302,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
dl->dl_count = 0;
/*
* Does this have to happen below, for all attaches, in case
- * the struct inode gets blown away by votes?
+ * the struct inode gets blown away by the downconvert thread?
*/
dl->dl_inode = igrab(inode);
dl->dl_parent_blkno = parent_blkno;
@@ -289,6 +325,21 @@ out_attach:
else
mlog_errno(ret);
+ /*
+ * In case of error, manually free the allocation and do the iput().
+ * We need to do this because error here means no d_instantiate(),
+ * which means iput() will not be called during dput(dentry).
+ */
+ if (ret < 0 && !alias) {
+ ocfs2_lock_res_free(&dl->dl_lockres);
+ BUG_ON(dl->dl_count != 1);
+ spin_lock(&dentry_attach_lock);
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry_attach_lock);
+ kfree(dl);
+ iput(inode);
+ }
+
dput(alias);
return ret;
@@ -318,9 +369,9 @@ out_attach:
static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl)
{
+ iput(dl->dl_inode);
ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
ocfs2_lock_res_free(&dl->dl_lockres);
- iput(dl->dl_inode);
kfree(dl);
}
@@ -344,12 +395,24 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
{
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
- mlog_bug_on_msg(!dl && !(dentry->d_flags & DCACHE_DISCONNECTED),
- "dentry: %.*s\n", dentry->d_name.len,
- dentry->d_name.name);
+ if (!dl) {
+ /*
+ * No dentry lock is ok if we're disconnected or
+ * unhashed.
+ */
+ if (!(dentry->d_flags & DCACHE_DISCONNECTED) &&
+ !d_unhashed(dentry)) {
+ unsigned long long ino = 0ULL;
+ if (inode)
+ ino = (unsigned long long)OCFS2_I(inode)->ip_blkno;
+ mlog(ML_ERROR, "Dentry is missing cluster lock. "
+ "inode: %llu, d_flags: 0x%x, d_name: %.*s\n",
+ ino, dentry->d_flags, dentry->d_name.len,
+ dentry->d_name.name);
+ }
- if (!dl)
goto out;
+ }
mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n",
dentry->d_name.len, dentry->d_name.name,
@@ -376,7 +439,7 @@ out:
* directory locks. The dentries have already been deleted on other
* nodes via ocfs2_remote_dentry_delete().
*
- * Normally, the VFS handles the d_move() for the file sytem, after
+ * Normally, the VFS handles the d_move() for the file system, after
* the ->rename() callback. OCFS2 wants to handle this internally, so
* the new lock can be created atomically with respect to the cluster.
*/
@@ -407,7 +470,7 @@ out_move:
d_move(dentry, target);
}
-struct dentry_operations ocfs2_dentry_ops = {
+const struct dentry_operations ocfs2_dentry_ops = {
.d_revalidate = ocfs2_dentry_revalidate,
.d_iput = ocfs2_dentry_iput,
};
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index c091c34d988..55f58892b15 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -26,7 +26,7 @@
#ifndef OCFS2_DCACHE_H
#define OCFS2_DCACHE_H
-extern struct dentry_operations ocfs2_dentry_ops;
+extern const struct dentry_operations ocfs2_dentry_ops;
struct ocfs2_dentry_lock {
unsigned int dl_count;
@@ -54,5 +54,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
struct inode *old_dir, struct inode *new_dir);
extern spinlock_t dentry_attach_lock;
+void ocfs2_dentry_attach_gen(struct dentry *dentry);
#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 04e01915b86..0717662b4ae 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -40,13 +40,15 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
+#include <linux/quotaops.h>
+#include <linux/sort.h>
-#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
@@ -55,60 +57,1803 @@
#include "journal.h"
#include "namei.h"
#include "suballoc.h"
+#include "super.h"
+#include "sysfile.h"
#include "uptodate.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
+#define NAMEI_RA_CHUNKS 2
+#define NAMEI_RA_BLOCKS 4
+#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+
static unsigned char ocfs2_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
-static int ocfs2_extend_dir(struct ocfs2_super *osb,
- struct inode *dir,
- struct buffer_head *parent_fe_bh,
- struct buffer_head **new_de_bh);
+static int ocfs2_do_extend_dir(struct super_block *sb,
+ handle_t *handle,
+ struct inode *dir,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ struct buffer_head **new_bh);
+static int ocfs2_dir_indexed(struct inode *inode);
+
/*
- * ocfs2_readdir()
+ * These are distinct checks because future versions of the file system will
+ * want to have a trailing dirent structure independent of indexing.
+ */
+static int ocfs2_supports_dir_trailer(struct inode *dir)
+{
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
+}
+
+/*
+ * "new' here refers to the point at which we're creating a new
+ * directory via "mkdir()", but also when we're expanding an inline
+ * directory. In either case, we don't yet have the indexing bit set
+ * on the directory, so the standard checks will fail in when metaecc
+ * is turned off. Only directory-initialization type functions should
+ * use this then. Everything else wants ocfs2_supports_dir_trailer()
+ */
+static int ocfs2_new_dir_wants_trailer(struct inode *dir)
+{
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+ return ocfs2_meta_ecc(osb) ||
+ ocfs2_supports_indexed_dirs(osb);
+}
+
+static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
+{
+ return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
+}
+
+#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
+
+/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
+ * them more consistent? */
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+ void *data)
+{
+ char *p = data;
+
+ p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
+ return (struct ocfs2_dir_block_trailer *)p;
+}
+
+/*
+ * XXX: This is executed once on every dirent. We should consider optimizing
+ * it.
+ */
+static int ocfs2_skip_dir_trailer(struct inode *dir,
+ struct ocfs2_dir_entry *de,
+ unsigned long offset,
+ unsigned long blklen)
+{
+ unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
+
+ if (!ocfs2_supports_dir_trailer(dir))
+ return 0;
+
+ if (offset != toff)
+ return 0;
+
+ return 1;
+}
+
+static void ocfs2_init_dir_trailer(struct inode *inode,
+ struct buffer_head *bh, u16 rec_len)
+{
+ struct ocfs2_dir_block_trailer *trailer;
+
+ trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
+ strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
+ trailer->db_compat_rec_len =
+ cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
+ trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+ trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
+ trailer->db_free_rec_len = cpu_to_le16(rec_len);
+}
+/*
+ * Link an unindexed block with a dir trailer structure into the index free
+ * list. This function will modify dirdata_bh, but assumes you've already
+ * passed it to the journal.
+ */
+static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
+ struct buffer_head *dx_root_bh,
+ struct buffer_head *dirdata_bh)
+{
+ int ret;
+ struct ocfs2_dx_root_block *dx_root;
+ struct ocfs2_dir_block_trailer *trailer;
+
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
+ dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+ trailer->db_free_next = dx_root->dr_free_blk;
+ dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+
+ ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+ return ret;
+}
+
+static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
+{
+ return res->dl_prev_leaf_bh == NULL;
+}
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
+{
+ brelse(res->dl_dx_root_bh);
+ brelse(res->dl_leaf_bh);
+ brelse(res->dl_dx_leaf_bh);
+ brelse(res->dl_prev_leaf_bh);
+}
+
+static int ocfs2_dir_indexed(struct inode *inode)
+{
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
+{
+ return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
+}
+
+/*
+ * Hashing code adapted from ext3
+ */
+#define DELTA 0x9E3779B9
+
+static void TEA_transform(__u32 buf[4], __u32 const in[])
+{
+ __u32 sum = 0;
+ __u32 b0 = buf[0], b1 = buf[1];
+ __u32 a = in[0], b = in[1], c = in[2], d = in[3];
+ int n = 16;
+
+ do {
+ sum += DELTA;
+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+ } while (--n);
+
+ buf[0] += b0;
+ buf[1] += b1;
+}
+
+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+{
+ __u32 pad, val;
+ int i;
+
+ pad = (__u32)len | ((__u32)len << 8);
+ pad |= pad << 16;
+
+ val = pad;
+ if (len > num*4)
+ len = num * 4;
+ for (i = 0; i < len; i++) {
+ if ((i % 4) == 0)
+ val = pad;
+ val = msg[i] + (val << 8);
+ if ((i % 4) == 3) {
+ *buf++ = val;
+ val = pad;
+ num--;
+ }
+ }
+ if (--num >= 0)
+ *buf++ = val;
+ while (--num >= 0)
+ *buf++ = pad;
+}
+
+static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
+ struct ocfs2_dx_hinfo *hinfo)
+{
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ const char *p;
+ __u32 in[8], buf[4];
+
+ /*
+ * XXX: Is this really necessary, if the index is never looked
+ * at by readdir? Is a hash value of '0' a bad idea?
+ */
+ if ((len == 1 && !strncmp(".", name, 1)) ||
+ (len == 2 && !strncmp("..", name, 2))) {
+ buf[0] = buf[1] = 0;
+ goto out;
+ }
+
+#ifdef OCFS2_DEBUG_DX_DIRS
+ /*
+ * This makes it very easy to debug indexing problems. We
+ * should never allow this to be selected without hand editing
+ * this file though.
+ */
+ buf[0] = buf[1] = len;
+ goto out;
+#endif
+
+ memcpy(buf, osb->osb_dx_seed, sizeof(buf));
+
+ p = name;
+ while (len > 0) {
+ str2hashbuf(p, len, in, 4);
+ TEA_transform(buf, in);
+ len -= 16;
+ p += 16;
+ }
+
+out:
+ hinfo->major_hash = buf[0];
+ hinfo->minor_hash = buf[1];
+}
+
+/*
+ * bh passed here can be an inode block or a dir data block, depending
+ * on the inode inline data flag.
+ */
+static int ocfs2_check_dir_entry(struct inode * dir,
+ struct ocfs2_dir_entry * de,
+ struct buffer_head * bh,
+ unsigned long offset)
+{
+ const char *error_msg = NULL;
+ const int rlen = le16_to_cpu(de->rec_len);
+
+ if (unlikely(rlen < OCFS2_DIR_REC_LEN(1)))
+ error_msg = "rec_len is smaller than minimal";
+ else if (unlikely(rlen % 4 != 0))
+ error_msg = "rec_len % 4 != 0";
+ else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len)))
+ error_msg = "rec_len is too small for name_len";
+ else if (unlikely(
+ ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))
+ error_msg = "directory entry across blocks";
+
+ if (unlikely(error_msg != NULL))
+ mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
+ "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
+ offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
+ de->name_len);
+
+ return error_msg == NULL ? 1 : 0;
+}
+
+static inline int ocfs2_match(int len,
+ const char * const name,
+ struct ocfs2_dir_entry *de)
+{
+ if (len != de->name_len)
+ return 0;
+ if (!de->inode)
+ return 0;
+ return !memcmp(name, de->name, len);
+}
+
+/*
+ * Returns 0 if not found, -1 on failure, and 1 on success
+ */
+static inline int ocfs2_search_dirblock(struct buffer_head *bh,
+ struct inode *dir,
+ const char *name, int namelen,
+ unsigned long offset,
+ char *first_de,
+ unsigned int bytes,
+ struct ocfs2_dir_entry **res_dir)
+{
+ struct ocfs2_dir_entry *de;
+ char *dlimit, *de_buf;
+ int de_len;
+ int ret = 0;
+
+ de_buf = first_de;
+ dlimit = de_buf + bytes;
+
+ while (de_buf < dlimit) {
+ /* this code is executed quadratically often */
+ /* do minimal checking `by hand' */
+
+ de = (struct ocfs2_dir_entry *) de_buf;
+
+ if (de_buf + namelen <= dlimit &&
+ ocfs2_match(namelen, name, de)) {
+ /* found a match - just to be sure, do a full check */
+ if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+ ret = -1;
+ goto bail;
+ }
+ *res_dir = de;
+ ret = 1;
+ goto bail;
+ }
+
+ /* prevent looping on a bad block */
+ de_len = le16_to_cpu(de->rec_len);
+ if (de_len <= 0) {
+ ret = -1;
+ goto bail;
+ }
+
+ de_buf += de_len;
+ offset += de_len;
+ }
+
+bail:
+ trace_ocfs2_search_dirblock(ret);
+ return ret;
+}
+
+static struct buffer_head *ocfs2_find_entry_id(const char *name,
+ int namelen,
+ struct inode *dir,
+ struct ocfs2_dir_entry **res_dir)
+{
+ int ret, found;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_inline_data *data;
+
+ ret = ocfs2_read_inode_block(dir, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ data = &di->id2.i_data;
+
+ found = ocfs2_search_dirblock(di_bh, dir, name, namelen, 0,
+ data->id_data, i_size_read(dir), res_dir);
+ if (found == 1)
+ return di_bh;
+
+ brelse(di_bh);
+out:
+ return NULL;
+}
+
+static int ocfs2_validate_dir_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_dir_block_trailer *trailer =
+ ocfs2_trailer_from_bh(bh, sb);
+
+
+ /*
+ * We don't validate dirents here, that's handled
+ * in-place when the code walks them.
+ */
+ trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ *
+ * Note that we are safe to call this even if the directory
+ * doesn't have a trailer. Filesystems without metaecc will do
+ * nothing, and filesystems with it will have one.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
+ if (rc)
+ mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+ (unsigned long long)bh->b_blocknr);
+
+ return rc;
+}
+
+/*
+ * Validate a directory trailer.
*
+ * We check the trailer here rather than in ocfs2_validate_dir_block()
+ * because that function doesn't have the inode to test.
*/
-int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
+{
+ int rc = 0;
+ struct ocfs2_dir_block_trailer *trailer;
+
+ trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
+ if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+ rc = -EINVAL;
+ ocfs2_error(dir->i_sb,
+ "Invalid dirblock #%llu: "
+ "signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ trailer->db_signature);
+ goto out;
+ }
+ if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
+ rc = -EINVAL;
+ ocfs2_error(dir->i_sb,
+ "Directory block #%llu has an invalid "
+ "db_blkno of %llu",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ goto out;
+ }
+ if (le64_to_cpu(trailer->db_parent_dinode) !=
+ OCFS2_I(dir)->ip_blkno) {
+ rc = -EINVAL;
+ ocfs2_error(dir->i_sb,
+ "Directory block #%llu on dinode "
+ "#%llu has an invalid parent_dinode "
+ "of %llu",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ goto out;
+ }
+out:
+ return rc;
+}
+
+/*
+ * This function forces all errors to -EIO for consistency with its
+ * predecessor, ocfs2_bread(). We haven't audited what returning the
+ * real error codes would do to callers. We log the real codes with
+ * mlog_errno() before we squash them.
+ */
+static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
+ struct buffer_head **bh, int flags)
+{
+ int rc = 0;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
+ ocfs2_validate_dir_block);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ if (!(flags & OCFS2_BH_READAHEAD) &&
+ ocfs2_supports_dir_trailer(inode)) {
+ rc = ocfs2_check_dir_trailer(inode, tmp);
+ if (rc) {
+ if (!*bh)
+ brelse(tmp);
+ mlog_errno(rc);
+ goto out;
+ }
+ }
+
+ /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+ if (!*bh)
+ *bh = tmp;
+
+out:
+ return rc ? -EIO : 0;
+}
+
+/*
+ * Read the block at 'phys' which belongs to this directory
+ * inode. This function does no virtual->physical block translation -
+ * what's passed in is assumed to be a valid directory block.
+ */
+static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
+ struct buffer_head **bh)
+{
+ int ret;
+ struct buffer_head *tmp = *bh;
+
+ ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp,
+ ocfs2_validate_dir_block);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (ocfs2_supports_dir_trailer(dir)) {
+ ret = ocfs2_check_dir_trailer(dir, tmp);
+ if (ret) {
+ if (!*bh)
+ brelse(tmp);
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (!ret && !*bh)
+ *bh = tmp;
+out:
+ return ret;
+}
+
+static int ocfs2_validate_dx_root(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int ret;
+ struct ocfs2_dx_root_block *dx_root;
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
+
+ ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
+ if (ret) {
+ mlog(ML_ERROR,
+ "Checksum failed for dir index root block %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ return ret;
+ }
+
+ if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
+ ocfs2_error(sb,
+ "Dir Index Root # %llu has bad signature %.*s",
+ (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+ 7, dx_root->dr_signature);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
+ struct buffer_head **dx_root_bh)
+{
+ int ret;
+ u64 blkno = le64_to_cpu(di->i_dx_root);
+ struct buffer_head *tmp = *dx_root_bh;
+
+ ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
+ ocfs2_validate_dx_root);
+
+ /* If ocfs2_read_block() got us a new bh, pass it up. */
+ if (!ret && !*dx_root_bh)
+ *dx_root_bh = tmp;
+
+ return ret;
+}
+
+static int ocfs2_validate_dx_leaf(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int ret;
+ struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
+ if (ret) {
+ mlog(ML_ERROR,
+ "Checksum failed for dir index leaf block %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ return ret;
+ }
+
+ if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
+ ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
+ 7, dx_leaf->dl_signature);
+ return -EROFS;
+ }
+
+ return 0;
+}
+
+static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
+ struct buffer_head **dx_leaf_bh)
+{
+ int ret;
+ struct buffer_head *tmp = *dx_leaf_bh;
+
+ ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
+ ocfs2_validate_dx_leaf);
+
+ /* If ocfs2_read_block() got us a new bh, pass it up. */
+ if (!ret && !*dx_leaf_bh)
+ *dx_leaf_bh = tmp;
+
+ return ret;
+}
+
+/*
+ * Read a series of dx_leaf blocks. This expects all buffer_head
+ * pointers to be NULL on function entry.
+ */
+static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
+ struct buffer_head **dx_leaf_bhs)
+{
+ int ret;
+
+ ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0,
+ ocfs2_validate_dx_leaf);
+ if (ret)
+ mlog_errno(ret);
+
+ return ret;
+}
+
+static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
+ struct inode *dir,
+ struct ocfs2_dir_entry **res_dir)
+{
+ struct super_block *sb;
+ struct buffer_head *bh_use[NAMEI_RA_SIZE];
+ struct buffer_head *bh, *ret = NULL;
+ unsigned long start, block, b;
+ int ra_max = 0; /* Number of bh's in the readahead
+ buffer, bh_use[] */
+ int ra_ptr = 0; /* Current index into readahead
+ buffer */
+ int num = 0;
+ int nblocks, i, err;
+
+ sb = dir->i_sb;
+
+ nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+ start = OCFS2_I(dir)->ip_dir_start_lookup;
+ if (start >= nblocks)
+ start = 0;
+ block = start;
+
+restart:
+ do {
+ /*
+ * We deal with the read-ahead logic here.
+ */
+ if (ra_ptr >= ra_max) {
+ /* Refill the readahead buffer */
+ ra_ptr = 0;
+ b = block;
+ for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+ /*
+ * Terminate if we reach the end of the
+ * directory and must wrap, or if our
+ * search has finished at this block.
+ */
+ if (b >= nblocks || (num && block == start)) {
+ bh_use[ra_max] = NULL;
+ break;
+ }
+ num++;
+
+ bh = NULL;
+ err = ocfs2_read_dir_block(dir, b++, &bh,
+ OCFS2_BH_READAHEAD);
+ bh_use[ra_max] = bh;
+ }
+ }
+ if ((bh = bh_use[ra_ptr++]) == NULL)
+ goto next;
+ if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
+ /* read error, skip block & hope for the best.
+ * ocfs2_read_dir_block() has released the bh. */
+ ocfs2_error(dir->i_sb, "reading directory %llu, "
+ "offset %lu\n",
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ block);
+ goto next;
+ }
+ i = ocfs2_search_dirblock(bh, dir, name, namelen,
+ block << sb->s_blocksize_bits,
+ bh->b_data, sb->s_blocksize,
+ res_dir);
+ if (i == 1) {
+ OCFS2_I(dir)->ip_dir_start_lookup = block;
+ ret = bh;
+ goto cleanup_and_exit;
+ } else {
+ brelse(bh);
+ if (i < 0)
+ goto cleanup_and_exit;
+ }
+ next:
+ if (++block >= nblocks)
+ block = 0;
+ } while (block != start);
+
+ /*
+ * If the directory has grown while we were searching, then
+ * search the last part of the directory before giving up.
+ */
+ block = nblocks;
+ nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+ if (block < nblocks) {
+ start = 0;
+ goto restart;
+ }
+
+cleanup_and_exit:
+ /* Clean up the read-ahead blocks */
+ for (; ra_ptr < ra_max; ra_ptr++)
+ brelse(bh_use[ra_ptr]);
+
+ trace_ocfs2_find_entry_el(ret);
+ return ret;
+}
+
+static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
+ struct ocfs2_extent_list *el,
+ u32 major_hash,
+ u32 *ret_cpos,
+ u64 *ret_phys_blkno,
+ unsigned int *ret_clen)
+{
+ int ret = 0, i, found;
+ struct buffer_head *eb_bh = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_rec *rec = NULL;
+
+ if (el->l_tree_depth) {
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash,
+ &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in "
+ "btree tree block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
+ }
+ }
+
+ found = 0;
+ for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+ rec = &el->l_recs[i];
+
+ if (le32_to_cpu(rec->e_cpos) <= major_hash) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+ "record (%u, %u, 0) in btree", inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
+ ret = -EROFS;
+ goto out;
+ }
+
+ if (ret_phys_blkno)
+ *ret_phys_blkno = le64_to_cpu(rec->e_blkno);
+ if (ret_cpos)
+ *ret_cpos = le32_to_cpu(rec->e_cpos);
+ if (ret_clen)
+ *ret_clen = le16_to_cpu(rec->e_leaf_clusters);
+
+out:
+ brelse(eb_bh);
+ return ret;
+}
+
+/*
+ * Returns the block index, from the start of the cluster which this
+ * hash belongs too.
+ */
+static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+ u32 minor_hash)
+{
+ return minor_hash & osb->osb_dx_mask;
+}
+
+static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+ struct ocfs2_dx_hinfo *hinfo)
+{
+ return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
+}
+
+static int ocfs2_dx_dir_lookup(struct inode *inode,
+ struct ocfs2_extent_list *el,
+ struct ocfs2_dx_hinfo *hinfo,
+ u32 *ret_cpos,
+ u64 *ret_phys_blkno)
+{
+ int ret = 0;
+ unsigned int cend, uninitialized_var(clen);
+ u32 uninitialized_var(cpos);
+ u64 uninitialized_var(blkno);
+ u32 name_hash = hinfo->major_hash;
+
+ ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
+ &clen);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cend = cpos + clen;
+ if (name_hash >= cend) {
+ /* We want the last cluster */
+ blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
+ cpos += clen - 1;
+ } else {
+ blkno += ocfs2_clusters_to_blocks(inode->i_sb,
+ name_hash - cpos);
+ cpos = name_hash;
+ }
+
+ /*
+ * We now have the cluster which should hold our entry. To
+ * find the exact block from the start of the cluster to
+ * search, we take the lower bits of the hash.
+ */
+ blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
+
+ if (ret_phys_blkno)
+ *ret_phys_blkno = blkno;
+ if (ret_cpos)
+ *ret_cpos = cpos;
+
+out:
+
+ return ret;
+}
+
+static int ocfs2_dx_dir_search(const char *name, int namelen,
+ struct inode *dir,
+ struct ocfs2_dx_root_block *dx_root,
+ struct ocfs2_dir_lookup_result *res)
+{
+ int ret, i, found;
+ u64 uninitialized_var(phys);
+ struct buffer_head *dx_leaf_bh = NULL;
+ struct ocfs2_dx_leaf *dx_leaf;
+ struct ocfs2_dx_entry *dx_entry = NULL;
+ struct buffer_head *dir_ent_bh = NULL;
+ struct ocfs2_dir_entry *dir_ent = NULL;
+ struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
+ struct ocfs2_extent_list *dr_el;
+ struct ocfs2_dx_entry_list *entry_list;
+
+ ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
+
+ if (ocfs2_dx_root_inline(dx_root)) {
+ entry_list = &dx_root->dr_entries;
+ goto search;
+ }
+
+ dr_el = &dx_root->dr_list;
+
+ ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno,
+ namelen, name, hinfo->major_hash,
+ hinfo->minor_hash, (unsigned long long)phys);
+
+ ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
+
+ trace_ocfs2_dx_dir_search_leaf_info(
+ le16_to_cpu(dx_leaf->dl_list.de_num_used),
+ le16_to_cpu(dx_leaf->dl_list.de_count));
+
+ entry_list = &dx_leaf->dl_list;
+
+search:
+ /*
+ * Empty leaf is legal, so no need to check for that.
+ */
+ found = 0;
+ for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
+ dx_entry = &entry_list->de_entries[i];
+
+ if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
+ || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
+ continue;
+
+ /*
+ * Search unindexed leaf block now. We're not
+ * guaranteed to find anything.
+ */
+ ret = ocfs2_read_dir_block_direct(dir,
+ le64_to_cpu(dx_entry->dx_dirent_blk),
+ &dir_ent_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * XXX: We should check the unindexed block here,
+ * before using it.
+ */
+
+ found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
+ 0, dir_ent_bh->b_data,
+ dir->i_sb->s_blocksize, &dir_ent);
+ if (found == 1)
+ break;
+
+ if (found == -1) {
+ /* This means we found a bad directory entry. */
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ brelse(dir_ent_bh);
+ dir_ent_bh = NULL;
+ }
+
+ if (found <= 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ res->dl_leaf_bh = dir_ent_bh;
+ res->dl_entry = dir_ent;
+ res->dl_dx_leaf_bh = dx_leaf_bh;
+ res->dl_dx_entry = dx_entry;
+
+ ret = 0;
+out:
+ if (ret) {
+ brelse(dx_leaf_bh);
+ brelse(dir_ent_bh);
+ }
+ return ret;
+}
+
+static int ocfs2_find_entry_dx(const char *name, int namelen,
+ struct inode *dir,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct buffer_head *dx_root_bh = NULL;
+ struct ocfs2_dx_root_block *dx_root;
+
+ ret = ocfs2_read_inode_block(dir, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+
+ ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
+ if (ret) {
+ if (ret != -ENOENT)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ lookup->dl_dx_root_bh = dx_root_bh;
+ dx_root_bh = NULL;
+out:
+ brelse(di_bh);
+ brelse(dx_root_bh);
+ return ret;
+}
+
+/*
+ * Try to find an entry of the provided name within 'dir'.
+ *
+ * If nothing was found, -ENOENT is returned. Otherwise, zero is
+ * returned and the struct 'res' will contain information useful to
+ * other directory manipulation functions.
+ *
+ * Caller can NOT assume anything about the contents of the
+ * buffer_heads - they are passed back only so that it can be passed
+ * into any one of the manipulation functions (add entry, delete
+ * entry, etc). As an example, bh in the extent directory case is a
+ * data block, in the inline-data case it actually points to an inode,
+ * in the indexed directory case, multiple buffers are involved.
+ */
+int ocfs2_find_entry(const char *name, int namelen,
+ struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
+{
+ struct buffer_head *bh;
+ struct ocfs2_dir_entry *res_dir = NULL;
+
+ if (ocfs2_dir_indexed(dir))
+ return ocfs2_find_entry_dx(name, namelen, dir, lookup);
+
+ /*
+ * The unindexed dir code only uses part of the lookup
+ * structure, so there's no reason to push it down further
+ * than this.
+ */
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
+ else
+ bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
+
+ if (bh == NULL)
+ return -ENOENT;
+
+ lookup->dl_leaf_bh = bh;
+ lookup->dl_entry = res_dir;
+ return 0;
+}
+
+/*
+ * Update inode number and type of a previously found directory entry.
+ */
+int ocfs2_update_entry(struct inode *dir, handle_t *handle,
+ struct ocfs2_dir_lookup_result *res,
+ struct inode *new_entry_inode)
+{
+ int ret;
+ ocfs2_journal_access_func access = ocfs2_journal_access_db;
+ struct ocfs2_dir_entry *de = res->dl_entry;
+ struct buffer_head *de_bh = res->dl_leaf_bh;
+
+ /*
+ * The same code works fine for both inline-data and extent
+ * based directories, so no need to split this up. The only
+ * difference is the journal_access function.
+ */
+
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ access = ocfs2_journal_access_di;
+
+ ret = access(handle, INODE_CACHE(dir), de_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno);
+ ocfs2_set_de_type(de, new_entry_inode->i_mode);
+
+ ocfs2_journal_dirty(handle, de_bh);
+
+out:
+ return ret;
+}
+
+/*
+ * __ocfs2_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
+static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
+ struct ocfs2_dir_entry *de_del,
+ struct buffer_head *bh, char *first_de,
+ unsigned int bytes)
+{
+ struct ocfs2_dir_entry *de, *pde;
+ int i, status = -ENOENT;
+ ocfs2_journal_access_func access = ocfs2_journal_access_db;
+
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ access = ocfs2_journal_access_di;
+
+ i = 0;
+ pde = NULL;
+ de = (struct ocfs2_dir_entry *) first_de;
+ while (i < bytes) {
+ if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
+ status = -EIO;
+ mlog_errno(status);
+ goto bail;
+ }
+ if (de == de_del) {
+ status = access(handle, INODE_CACHE(dir), bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ status = -EIO;
+ mlog_errno(status);
+ goto bail;
+ }
+ if (pde)
+ le16_add_cpu(&pde->rec_len,
+ le16_to_cpu(de->rec_len));
+ de->inode = 0;
+ dir->i_version++;
+ ocfs2_journal_dirty(handle, bh);
+ goto bail;
+ }
+ i += le16_to_cpu(de->rec_len);
+ pde = de;
+ de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
+ }
+bail:
+ return status;
+}
+
+static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
+{
+ unsigned int hole;
+
+ if (le64_to_cpu(de->inode) == 0)
+ hole = le16_to_cpu(de->rec_len);
+ else
+ hole = le16_to_cpu(de->rec_len) -
+ OCFS2_DIR_REC_LEN(de->name_len);
+
+ return hole;
+}
+
+static int ocfs2_find_max_rec_len(struct super_block *sb,
+ struct buffer_head *dirblock_bh)
+{
+ int size, this_hole, largest_hole = 0;
+ char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
+ struct ocfs2_dir_entry *de;
+
+ trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
+ size = ocfs2_dir_trailer_blk_off(sb);
+ limit = start + size;
+ de_buf = start;
+ de = (struct ocfs2_dir_entry *)de_buf;
+ do {
+ if (de_buf != trailer) {
+ this_hole = ocfs2_figure_dirent_hole(de);
+ if (this_hole > largest_hole)
+ largest_hole = this_hole;
+ }
+
+ de_buf += le16_to_cpu(de->rec_len);
+ de = (struct ocfs2_dir_entry *)de_buf;
+ } while (de_buf < limit);
+
+ if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+ return largest_hole;
+ return 0;
+}
+
+static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
+ int index)
+{
+ int num_used = le16_to_cpu(entry_list->de_num_used);
+
+ if (num_used == 1 || index == (num_used - 1))
+ goto clear;
+
+ memmove(&entry_list->de_entries[index],
+ &entry_list->de_entries[index + 1],
+ (num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
+clear:
+ num_used--;
+ memset(&entry_list->de_entries[num_used], 0,
+ sizeof(struct ocfs2_dx_entry));
+ entry_list->de_num_used = cpu_to_le16(num_used);
+}
+
+static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ int ret, index, max_rec_len, add_to_free_list = 0;
+ struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+ struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
+ struct ocfs2_dx_leaf *dx_leaf;
+ struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
+ struct ocfs2_dir_block_trailer *trailer;
+ struct ocfs2_dx_root_block *dx_root;
+ struct ocfs2_dx_entry_list *entry_list;
+
+ /*
+ * This function gets a bit messy because we might have to
+ * modify the root block, regardless of whether the indexed
+ * entries are stored inline.
+ */
+
+ /*
+ * *Only* set 'entry_list' here, based on where we're looking
+ * for the indexed entries. Later, we might still want to
+ * journal both blocks, based on free list state.
+ */
+ dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+ if (ocfs2_dx_root_inline(dx_root)) {
+ entry_list = &dx_root->dr_entries;
+ } else {
+ dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
+ entry_list = &dx_leaf->dl_list;
+ }
+
+ /* Neither of these are a disk corruption - that should have
+ * been caught by lookup, before we got here. */
+ BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
+ BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
+
+ index = (char *)dx_entry - (char *)entry_list->de_entries;
+ index /= sizeof(*dx_entry);
+
+ if (index >= le16_to_cpu(entry_list->de_num_used)) {
+ mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
+ entry_list, dx_entry);
+ return -EIO;
+ }
+
+ /*
+ * We know that removal of this dirent will leave enough room
+ * for a new one, so add this block to the free list if it
+ * isn't already there.
+ */
+ trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+ if (trailer->db_free_rec_len == 0)
+ add_to_free_list = 1;
+
+ /*
+ * Add the block holding our index into the journal before
+ * removing the unindexed entry. If we get an error return
+ * from __ocfs2_delete_entry(), then it hasn't removed the
+ * entry yet. Likewise, successful return means we *must*
+ * remove the indexed entry.
+ *
+ * We're also careful to journal the root tree block here as
+ * the entry count needs to be updated. Also, we might be
+ * adding to the start of the free list.
+ */
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!ocfs2_dx_root_inline(dx_root)) {
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+ lookup->dl_dx_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno,
+ index);
+
+ ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
+ leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
+ trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+ if (add_to_free_list) {
+ trailer->db_free_next = dx_root->dr_free_blk;
+ dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
+ ocfs2_journal_dirty(handle, dx_root_bh);
+ }
+
+ /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
+ ocfs2_journal_dirty(handle, leaf_bh);
+
+ le32_add_cpu(&dx_root->dr_num_entries, -1);
+ ocfs2_journal_dirty(handle, dx_root_bh);
+
+ ocfs2_dx_list_remove_entry(entry_list, index);
+
+ if (!ocfs2_dx_root_inline(dx_root))
+ ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
+
+out:
+ return ret;
+}
+
+static inline int ocfs2_delete_entry_id(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_dir_entry *de_del,
+ struct buffer_head *bh)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_inline_data *data;
+
+ ret = ocfs2_read_inode_block(dir, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ data = &di->id2.i_data;
+
+ ret = __ocfs2_delete_entry(handle, dir, de_del, bh, data->id_data,
+ i_size_read(dir));
+
+ brelse(di_bh);
+out:
+ return ret;
+}
+
+static inline int ocfs2_delete_entry_el(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_dir_entry *de_del,
+ struct buffer_head *bh)
+{
+ return __ocfs2_delete_entry(handle, dir, de_del, bh, bh->b_data,
+ bh->b_size);
+}
+
+/*
+ * Delete a directory entry. Hide the details of directory
+ * implementation from the caller.
+ */
+int ocfs2_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_dir_lookup_result *res)
+{
+ if (ocfs2_dir_indexed(dir))
+ return ocfs2_delete_entry_dx(handle, dir, res);
+
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
+ res->dl_leaf_bh);
+
+ return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
+ res->dl_leaf_bh);
+}
+
+/*
+ * Check whether 'de' has enough room to hold an entry of
+ * 'new_rec_len' bytes.
+ */
+static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
+ unsigned int new_rec_len)
+{
+ unsigned int de_really_used;
+
+ /* Check whether this is an empty record with enough space */
+ if (le64_to_cpu(de->inode) == 0 &&
+ le16_to_cpu(de->rec_len) >= new_rec_len)
+ return 1;
+
+ /*
+ * Record might have free space at the end which we can
+ * use.
+ */
+ de_really_used = OCFS2_DIR_REC_LEN(de->name_len);
+ if (le16_to_cpu(de->rec_len) >= (de_really_used + new_rec_len))
+ return 1;
+
+ return 0;
+}
+
+static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
+ struct ocfs2_dx_entry *dx_new_entry)
+{
+ int i;
+
+ i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
+ dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
+
+ le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
+}
+
+static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
+ struct ocfs2_dx_hinfo *hinfo,
+ u64 dirent_blk)
+{
+ int i;
+ struct ocfs2_dx_entry *dx_entry;
+
+ i = le16_to_cpu(entry_list->de_num_used);
+ dx_entry = &entry_list->de_entries[i];
+
+ memset(dx_entry, 0, sizeof(*dx_entry));
+ dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
+ dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
+ dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
+
+ le16_add_cpu(&entry_list->de_num_used, 1);
+}
+
+static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
+ struct ocfs2_dx_hinfo *hinfo,
+ u64 dirent_blk,
+ struct buffer_head *dx_leaf_bh)
+{
+ int ret;
+ struct ocfs2_dx_leaf *dx_leaf;
+
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+ ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk);
+ ocfs2_journal_dirty(handle, dx_leaf_bh);
+
+out:
+ return ret;
+}
+
+static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
+ struct ocfs2_dx_hinfo *hinfo,
+ u64 dirent_blk,
+ struct ocfs2_dx_root_block *dx_root)
+{
+ ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk);
+}
+
+static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ int ret = 0;
+ struct ocfs2_dx_root_block *dx_root;
+ struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
+ if (ocfs2_dx_root_inline(dx_root)) {
+ ocfs2_dx_inline_root_insert(dir, handle,
+ &lookup->dl_hinfo,
+ lookup->dl_leaf_bh->b_blocknr,
+ dx_root);
+ } else {
+ ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
+ lookup->dl_leaf_bh->b_blocknr,
+ lookup->dl_dx_leaf_bh);
+ if (ret)
+ goto out;
+ }
+
+ le32_add_cpu(&dx_root->dr_num_entries, 1);
+ ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+ return ret;
+}
+
+static void ocfs2_remove_block_from_free_list(struct inode *dir,
+ handle_t *handle,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ struct ocfs2_dir_block_trailer *trailer, *prev;
+ struct ocfs2_dx_root_block *dx_root;
+ struct buffer_head *bh;
+
+ trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+
+ if (ocfs2_free_list_at_root(lookup)) {
+ bh = lookup->dl_dx_root_bh;
+ dx_root = (struct ocfs2_dx_root_block *)bh->b_data;
+ dx_root->dr_free_blk = trailer->db_free_next;
+ } else {
+ bh = lookup->dl_prev_leaf_bh;
+ prev = ocfs2_trailer_from_bh(bh, dir->i_sb);
+ prev->db_free_next = trailer->db_free_next;
+ }
+
+ trailer->db_free_rec_len = cpu_to_le16(0);
+ trailer->db_free_next = cpu_to_le64(0);
+
+ ocfs2_journal_dirty(handle, bh);
+ ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+}
+
+/*
+ * This expects that a journal write has been reserved on
+ * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
+ */
+static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ int max_rec_len;
+ struct ocfs2_dir_block_trailer *trailer;
+
+ /* Walk dl_leaf_bh to figure out what the new free rec_len is. */
+ max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
+ if (max_rec_len) {
+ /*
+ * There's still room in this block, so no need to remove it
+ * from the free list. In this case, we just want to update
+ * the rec len accounting.
+ */
+ trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+ trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+ ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+ } else {
+ ocfs2_remove_block_from_free_list(dir, handle, lookup);
+ }
+}
+
+/* we don't always have a dentry for what we want to add, so people
+ * like orphan dir can call this instead.
+ *
+ * The lookup context must have been filled from
+ * ocfs2_prepare_dir_for_insert.
+ */
+int __ocfs2_add_entry(handle_t *handle,
+ struct inode *dir,
+ const char *name, int namelen,
+ struct inode *inode, u64 blkno,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ unsigned long offset;
+ unsigned short rec_len;
+ struct ocfs2_dir_entry *de, *de1;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
+ struct super_block *sb = dir->i_sb;
+ int retval, status;
+ unsigned int size = sb->s_blocksize;
+ struct buffer_head *insert_bh = lookup->dl_leaf_bh;
+ char *data_start = insert_bh->b_data;
+
+ if (!namelen)
+ return -EINVAL;
+
+ if (ocfs2_dir_indexed(dir)) {
+ struct buffer_head *bh;
+
+ /*
+ * An indexed dir may require that we update the free space
+ * list. Reserve a write to the previous node in the list so
+ * that we don't fail later.
+ *
+ * XXX: This can be either a dx_root_block, or an unindexed
+ * directory tree leaf block.
+ */
+ if (ocfs2_free_list_at_root(lookup)) {
+ bh = lookup->dl_dx_root_bh;
+ retval = ocfs2_journal_access_dr(handle,
+ INODE_CACHE(dir), bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ } else {
+ bh = lookup->dl_prev_leaf_bh;
+ retval = ocfs2_journal_access_db(handle,
+ INODE_CACHE(dir), bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ }
+ if (retval) {
+ mlog_errno(retval);
+ return retval;
+ }
+ } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ data_start = di->id2.i_data.id_data;
+ size = i_size_read(dir);
+
+ BUG_ON(insert_bh != parent_fe_bh);
+ }
+
+ rec_len = OCFS2_DIR_REC_LEN(namelen);
+ offset = 0;
+ de = (struct ocfs2_dir_entry *) data_start;
+ while (1) {
+ BUG_ON((char *)de >= (size + data_start));
+
+ /* These checks should've already been passed by the
+ * prepare function, but I guess we can leave them
+ * here anyway. */
+ if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
+ retval = -ENOENT;
+ goto bail;
+ }
+ if (ocfs2_match(namelen, name, de)) {
+ retval = -EEXIST;
+ goto bail;
+ }
+
+ /* We're guaranteed that we should have space, so we
+ * can't possibly have hit the trailer...right? */
+ mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
+ "Hit dir trailer trying to insert %.*s "
+ "(namelen %d) into directory %llu. "
+ "offset is %lu, trailer offset is %d\n",
+ namelen, name, namelen,
+ (unsigned long long)parent_fe_bh->b_blocknr,
+ offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
+
+ if (ocfs2_dirent_would_fit(de, rec_len)) {
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
+ if (retval < 0) {
+ mlog_errno(retval);
+ goto bail;
+ }
+
+ if (insert_bh == parent_fe_bh)
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(dir),
+ insert_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ else {
+ status = ocfs2_journal_access_db(handle,
+ INODE_CACHE(dir),
+ insert_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+
+ if (ocfs2_dir_indexed(dir)) {
+ status = ocfs2_dx_dir_insert(dir,
+ handle,
+ lookup);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+ }
+
+ /* By now the buffer is marked for journaling */
+ offset += le16_to_cpu(de->rec_len);
+ if (le64_to_cpu(de->inode)) {
+ de1 = (struct ocfs2_dir_entry *)((char *) de +
+ OCFS2_DIR_REC_LEN(de->name_len));
+ de1->rec_len =
+ cpu_to_le16(le16_to_cpu(de->rec_len) -
+ OCFS2_DIR_REC_LEN(de->name_len));
+ de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+ de = de1;
+ }
+ de->file_type = OCFS2_FT_UNKNOWN;
+ if (blkno) {
+ de->inode = cpu_to_le64(blkno);
+ ocfs2_set_de_type(de, inode->i_mode);
+ } else
+ de->inode = 0;
+ de->name_len = namelen;
+ memcpy(de->name, name, namelen);
+
+ if (ocfs2_dir_indexed(dir))
+ ocfs2_recalc_free_list(dir, handle, lookup);
+
+ dir->i_version++;
+ ocfs2_journal_dirty(handle, insert_bh);
+ retval = 0;
+ goto bail;
+ }
+
+ offset += le16_to_cpu(de->rec_len);
+ de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
+ }
+
+ /* when you think about it, the assert above should prevent us
+ * from ever getting here. */
+ retval = -ENOSPC;
+bail:
+ if (retval)
+ mlog_errno(retval);
+
+ return retval;
+}
+
+static int ocfs2_dir_foreach_blk_id(struct inode *inode,
+ u64 *f_version,
+ struct dir_context *ctx)
+{
+ int ret, i;
+ unsigned long offset = ctx->pos;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_inline_data *data;
+ struct ocfs2_dir_entry *de;
+
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret) {
+ mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ goto out;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ data = &di->id2.i_data;
+
+ while (ctx->pos < i_size_read(inode)) {
+ /* If the dir block has changed since the last call to
+ * readdir(2), then we might be pointing to an invalid
+ * dirent right now. Scan from the start of the block
+ * to make sure. */
+ if (*f_version != inode->i_version) {
+ for (i = 0; i < i_size_read(inode) && i < offset; ) {
+ de = (struct ocfs2_dir_entry *)
+ (data->id_data + i);
+ /* It's too expensive to do a full
+ * dirent test each time round this
+ * loop, but we do have to test at
+ * least that it is non-zero. A
+ * failure will be detected in the
+ * dirent test below. */
+ if (le16_to_cpu(de->rec_len) <
+ OCFS2_DIR_REC_LEN(1))
+ break;
+ i += le16_to_cpu(de->rec_len);
+ }
+ ctx->pos = offset = i;
+ *f_version = inode->i_version;
+ }
+
+ de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
+ if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) {
+ /* On error, skip the f_pos to the end. */
+ ctx->pos = i_size_read(inode);
+ break;
+ }
+ offset += le16_to_cpu(de->rec_len);
+ if (le64_to_cpu(de->inode)) {
+ unsigned char d_type = DT_UNKNOWN;
+
+ if (de->file_type < OCFS2_FT_MAX)
+ d_type = ocfs2_filetype_table[de->file_type];
+
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le64_to_cpu(de->inode), d_type))
+ goto out;
+ }
+ ctx->pos += le16_to_cpu(de->rec_len);
+ }
+out:
+ brelse(di_bh);
+ return 0;
+}
+
+/*
+ * NOTE: This function can be called against unindexed directories,
+ * and indexed ones.
+ */
+static int ocfs2_dir_foreach_blk_el(struct inode *inode,
+ u64 *f_version,
+ struct dir_context *ctx,
+ bool persist)
{
- int error = 0;
unsigned long offset, blk, last_ra_blk = 0;
- int i, stored;
+ int i;
struct buffer_head * bh, * tmp;
struct ocfs2_dir_entry * de;
- int err;
- struct inode *inode = filp->f_dentry->d_inode;
struct super_block * sb = inode->i_sb;
unsigned int ra_sectors = 16;
+ int stored = 0;
- mlog_entry("dirino=%llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
-
- stored = 0;
bh = NULL;
- error = ocfs2_meta_lock(inode, NULL, NULL, 0);
- if (error < 0) {
- if (error != -ENOENT)
- mlog_errno(error);
- /* we haven't got any yet, so propagate the error. */
- stored = error;
- goto bail_nolock;
- }
-
- offset = filp->f_pos & (sb->s_blocksize - 1);
+ offset = ctx->pos & (sb->s_blocksize - 1);
- while (!error && !stored && filp->f_pos < i_size_read(inode)) {
- blk = (filp->f_pos) >> sb->s_blocksize_bits;
- bh = ocfs2_bread(inode, blk, &err, 0);
- if (!bh) {
- mlog(ML_ERROR,
- "directory #%llu contains a hole at offset %lld\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- filp->f_pos);
- filp->f_pos += sb->s_blocksize - offset;
+ while (ctx->pos < i_size_read(inode)) {
+ blk = ctx->pos >> sb->s_blocksize_bits;
+ if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
+ /* Skip the corrupt dirblock and keep trying */
+ ctx->pos += sb->s_blocksize - offset;
continue;
}
@@ -121,20 +1866,20 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
|| (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
i > 0; i--) {
- tmp = ocfs2_bread(inode, ++blk, &err, 1);
- if (tmp)
+ tmp = NULL;
+ if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
+ OCFS2_BH_READAHEAD))
brelse(tmp);
}
last_ra_blk = blk;
ra_sectors = 8;
}
-revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (filp->f_version != inode->i_version) {
+ if (*f_version != inode->i_version) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ocfs2_dir_entry *) (bh->b_data + i);
/* It's too expensive to do a full
@@ -149,61 +1894,104 @@ revalidate:
i += le16_to_cpu(de->rec_len);
}
offset = i;
- filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+ ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- filp->f_version = inode->i_version;
+ *f_version = inode->i_version;
}
- while (!error && filp->f_pos < i_size_read(inode)
+ while (ctx->pos < i_size_read(inode)
&& offset < sb->s_blocksize) {
de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
/* On error, skip the f_pos to the
next block. */
- filp->f_pos = (filp->f_pos |
- (sb->s_blocksize - 1)) + 1;
+ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
brelse(bh);
- goto bail;
+ continue;
}
- offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- unsigned long version = filp->f_version;
unsigned char d_type = DT_UNKNOWN;
if (de->file_type < OCFS2_FT_MAX)
d_type = ocfs2_filetype_table[de->file_type];
- error = filldir(dirent, de->name,
+ if (!dir_emit(ctx, de->name,
de->name_len,
- filp->f_pos,
- ino_from_blkno(sb, le64_to_cpu(de->inode)),
- d_type);
- if (error)
- break;
- if (version != filp->f_version)
- goto revalidate;
- stored ++;
+ le64_to_cpu(de->inode),
+ d_type)) {
+ brelse(bh);
+ return 0;
+ }
+ stored++;
}
- filp->f_pos += le16_to_cpu(de->rec_len);
+ offset += le16_to_cpu(de->rec_len);
+ ctx->pos += le16_to_cpu(de->rec_len);
}
offset = 0;
brelse(bh);
+ bh = NULL;
+ if (!persist && stored)
+ break;
}
+ return 0;
+}
- stored = 0;
-bail:
- ocfs2_meta_unlock(inode, 0);
+static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
+ struct dir_context *ctx,
+ bool persist)
+{
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return ocfs2_dir_foreach_blk_id(inode, f_version, ctx);
+ return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist);
+}
+
+/*
+ * This is intended to be called from inside other kernel functions,
+ * so we fake some arguments.
+ */
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
+{
+ u64 version = inode->i_version;
+ ocfs2_dir_foreach_blk(inode, &version, ctx, true);
+ return 0;
+}
+
+/*
+ * ocfs2_readdir()
+ *
+ */
+int ocfs2_readdir(struct file *file, struct dir_context *ctx)
+{
+ int error = 0;
+ struct inode *inode = file_inode(file);
+ int lock_level = 0;
+
+ trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
+ if (lock_level && error >= 0) {
+ /* We release EX lock which used to update atime
+ * and get PR lock again to reduce contention
+ * on commonly accessed directories. */
+ ocfs2_inode_unlock(inode, 1);
+ lock_level = 0;
+ error = ocfs2_inode_lock(inode, NULL, 0);
+ }
+ if (error < 0) {
+ if (error != -ENOENT)
+ mlog_errno(error);
+ /* we haven't got any yet, so propagate the error. */
+ goto bail_nolock;
+ }
+
+ error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
+
+ ocfs2_inode_unlock(inode, lock_level);
+ if (error)
+ mlog_errno(error);
bail_nolock:
- mlog_exit(stored);
- return stored;
+ return error;
}
/*
@@ -213,36 +2001,41 @@ int ocfs2_find_files_on_disk(const char *name,
int namelen,
u64 *blkno,
struct inode *inode,
- struct buffer_head **dirent_bh,
- struct ocfs2_dir_entry **dirent)
+ struct ocfs2_dir_lookup_result *lookup)
{
int status = -ENOENT;
- mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n",
- namelen, name, blkno, inode, dirent_bh, dirent);
+ trace_ocfs2_find_files_on_disk(namelen, name, blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
- *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
- if (!*dirent_bh || !*dirent) {
- status = -ENOENT;
+ status = ocfs2_find_entry(name, namelen, inode, lookup);
+ if (status)
goto leave;
- }
- *blkno = le64_to_cpu((*dirent)->inode);
+ *blkno = le64_to_cpu(lookup->dl_entry->inode);
status = 0;
leave:
- if (status < 0) {
- *dirent = NULL;
- if (*dirent_bh) {
- brelse(*dirent_bh);
- *dirent_bh = NULL;
- }
- }
- mlog_exit(status);
return status;
}
+/*
+ * Convenience function for callers which just want the block number
+ * mapped to a name and don't require the full dirent info, etc.
+ */
+int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
+ int namelen, u64 *blkno)
+{
+ int ret;
+ struct ocfs2_dir_lookup_result lookup = { NULL, };
+
+ ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
+ ocfs2_free_dir_lookup_result(&lookup);
+
+ return ret;
+}
+
/* Check for a name within a directory.
*
* Return 0 if the name does not exist
@@ -255,110 +2048,1103 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
int namelen)
{
int ret;
- struct buffer_head *dirent_bh = NULL;
- struct ocfs2_dir_entry *dirent = NULL;
+ struct ocfs2_dir_lookup_result lookup = { NULL, };
- mlog_entry("dir %llu, name '%.*s'\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
+ trace_ocfs2_check_dir_for_entry(
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
ret = -EEXIST;
- dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
- if (dirent_bh)
+ if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
goto bail;
ret = 0;
bail:
- if (dirent_bh)
- brelse(dirent_bh);
+ ocfs2_free_dir_lookup_result(&lookup);
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
+ return ret;
+}
+
+struct ocfs2_empty_dir_priv {
+ struct dir_context ctx;
+ unsigned seen_dot;
+ unsigned seen_dot_dot;
+ unsigned seen_other;
+ unsigned dx_dir;
+};
+static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
+ loff_t pos, u64 ino, unsigned type)
+{
+ struct ocfs2_empty_dir_priv *p = priv;
+
+ /*
+ * Check the positions of "." and ".." records to be sure
+ * they're in the correct place.
+ *
+ * Indexed directories don't need to proceed past the first
+ * two entries, so we end the scan after seeing '..'. Despite
+ * that, we allow the scan to proceed In the event that we
+ * have a corrupted indexed directory (no dot or dot dot
+ * entries). This allows us to double check for existing
+ * entries which might not have been found in the index.
+ */
+ if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
+ p->seen_dot = 1;
+ return 0;
+ }
+
+ if (name_len == 2 && !strncmp("..", name, 2) &&
+ pos == OCFS2_DIR_REC_LEN(1)) {
+ p->seen_dot_dot = 1;
+
+ if (p->dx_dir && p->seen_dot)
+ return 1;
+
+ return 0;
+ }
+
+ p->seen_other = 1;
+ return 1;
+}
+
+static int ocfs2_empty_dir_dx(struct inode *inode,
+ struct ocfs2_empty_dir_priv *priv)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+ struct buffer_head *dx_root_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_dx_root_block *dx_root;
+
+ priv->dx_dir = 1;
+
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+ if (le32_to_cpu(dx_root->dr_num_entries) != 2)
+ priv->seen_other = 1;
+
+out:
+ brelse(di_bh);
+ brelse(dx_root_bh);
return ret;
}
/*
* routine to check that the specified directory is empty (for rmdir)
+ *
+ * Returns 1 if dir is empty, zero otherwise.
+ *
+ * XXX: This is a performance problem for unindexed directories.
*/
int ocfs2_empty_dir(struct inode *inode)
{
- unsigned long offset;
- struct buffer_head * bh;
- struct ocfs2_dir_entry * de, * de1;
- struct super_block * sb;
- int err;
-
- sb = inode->i_sb;
- if ((i_size_read(inode) <
- (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
- !(bh = ocfs2_bread(inode, 0, &err, 0))) {
- mlog(ML_ERROR, "bad directory (dir #%llu) - no data block\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- return 1;
+ int ret;
+ struct ocfs2_empty_dir_priv priv = {
+ .ctx.actor = ocfs2_empty_dir_filldir,
+ };
+
+ if (ocfs2_dir_indexed(inode)) {
+ ret = ocfs2_empty_dir_dx(inode, &priv);
+ if (ret)
+ mlog_errno(ret);
+ /*
+ * We still run ocfs2_dir_foreach to get the checks
+ * for "." and "..".
+ */
}
- de = (struct ocfs2_dir_entry *) bh->b_data;
- de1 = (struct ocfs2_dir_entry *)
- ((char *)de + le16_to_cpu(de->rec_len));
- if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
- !le64_to_cpu(de1->inode) ||
- strcmp(".", de->name) ||
- strcmp("..", de1->name)) {
- mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n",
+ ret = ocfs2_dir_foreach(inode, &priv.ctx);
+ if (ret)
+ mlog_errno(ret);
+
+ if (!priv.seen_dot || !priv.seen_dot_dot) {
+ mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- brelse(bh);
+ /*
+ * XXX: Is it really safe to allow an unlink to continue?
+ */
return 1;
}
- offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
- de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
- while (offset < i_size_read(inode) ) {
- if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
- brelse(bh);
- bh = ocfs2_bread(inode,
- offset >> sb->s_blocksize_bits, &err, 0);
- if (!bh) {
- mlog(ML_ERROR, "dir %llu has a hole at %lu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, offset);
- offset += sb->s_blocksize;
- continue;
+
+ return !priv.seen_other;
+}
+
+/*
+ * Fills "." and ".." dirents in a new directory block. Returns dirent for
+ * "..", which might be used during creation of a directory with a trailing
+ * header. It is otherwise safe to ignore the return code.
+ */
+static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
+ struct inode *parent,
+ char *start,
+ unsigned int size)
+{
+ struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
+
+ de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+ de->name_len = 1;
+ de->rec_len =
+ cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+ strcpy(de->name, ".");
+ ocfs2_set_de_type(de, S_IFDIR);
+
+ de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
+ de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
+ de->rec_len = cpu_to_le16(size - OCFS2_DIR_REC_LEN(1));
+ de->name_len = 2;
+ strcpy(de->name, "..");
+ ocfs2_set_de_type(de, S_IFDIR);
+
+ return de;
+}
+
+/*
+ * This works together with code in ocfs2_mknod_locked() which sets
+ * the inline-data flag and initializes the inline-data section.
+ */
+static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct inode *parent,
+ struct inode *inode,
+ struct buffer_head *di_bh)
+{
+ int ret;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_inline_data *data = &di->id2.i_data;
+ unsigned int size = le16_to_cpu(data->id_count);
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
+ ocfs2_journal_dirty(handle, di_bh);
+
+ i_size_write(inode, size);
+ set_nlink(inode, 2);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+
+ ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct inode *parent,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_alloc_context *data_ac,
+ struct buffer_head **ret_new_bh)
+{
+ int status;
+ unsigned int size = osb->sb->s_blocksize;
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_dir_entry *de;
+
+ if (ocfs2_new_dir_wants_trailer(inode))
+ size = ocfs2_dir_trailer_blk_off(parent->i_sb);
+
+ status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
+ data_ac, NULL, &new_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
+
+ status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ memset(new_bh->b_data, 0, osb->sb->s_blocksize);
+
+ de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
+ if (ocfs2_new_dir_wants_trailer(inode)) {
+ int size = le16_to_cpu(de->rec_len);
+
+ /*
+ * Figure out the size of the hole left over after
+ * insertion of '.' and '..'. The trailer wants this
+ * information.
+ */
+ size -= OCFS2_DIR_REC_LEN(2);
+ size -= sizeof(struct ocfs2_dir_block_trailer);
+
+ ocfs2_init_dir_trailer(inode, new_bh, size);
+ }
+
+ ocfs2_journal_dirty(handle, new_bh);
+
+ i_size_write(inode, inode->i_sb->s_blocksize);
+ set_nlink(inode, 2);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = 0;
+ if (ret_new_bh) {
+ *ret_new_bh = new_bh;
+ new_bh = NULL;
+ }
+bail:
+ brelse(new_bh);
+
+ return status;
+}
+
+static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
+ handle_t *handle, struct inode *dir,
+ struct buffer_head *di_bh,
+ struct buffer_head *dirdata_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ int dx_inline, u32 num_entries,
+ struct buffer_head **ret_dx_root_bh)
+{
+ int ret;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+ u16 dr_suballoc_bit;
+ u64 suballoc_loc, dr_blkno;
+ unsigned int num_bits;
+ struct buffer_head *dx_root_bh = NULL;
+ struct ocfs2_dx_root_block *dx_root;
+ struct ocfs2_dir_block_trailer *trailer =
+ ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
+
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+ &dr_suballoc_bit, &num_bits, &dr_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_dx_dir_attach_index(
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)dr_blkno);
+
+ dx_root_bh = sb_getblk(osb->sb, dr_blkno);
+ if (dx_root_bh == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
+
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+ memset(dx_root, 0, osb->sb->s_blocksize);
+ strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
+ dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
+ dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
+ dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
+ dx_root->dr_blkno = cpu_to_le64(dr_blkno);
+ dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
+ dx_root->dr_num_entries = cpu_to_le32(num_entries);
+ if (le16_to_cpu(trailer->db_free_rec_len))
+ dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+ else
+ dx_root->dr_free_blk = cpu_to_le64(0);
+
+ if (dx_inline) {
+ dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
+ dx_root->dr_entries.de_count =
+ cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
+ } else {
+ dx_root->dr_list.l_count =
+ cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
+ }
+ ocfs2_journal_dirty(handle, dx_root_bh);
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ di->i_dx_root = cpu_to_le64(dr_blkno);
+
+ spin_lock(&OCFS2_I(dir)->ip_lock);
+ OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
+ di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+ spin_unlock(&OCFS2_I(dir)->ip_lock);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+ *ret_dx_root_bh = dx_root_bh;
+ dx_root_bh = NULL;
+
+out:
+ brelse(dx_root_bh);
+ return ret;
+}
+
+static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
+ handle_t *handle, struct inode *dir,
+ struct buffer_head **dx_leaves,
+ int num_dx_leaves, u64 start_blk)
+{
+ int ret, i;
+ struct ocfs2_dx_leaf *dx_leaf;
+ struct buffer_head *bh;
+
+ for (i = 0; i < num_dx_leaves; i++) {
+ bh = sb_getblk(osb->sb, start_blk + i);
+ if (bh == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dx_leaves[i] = bh;
+
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh);
+
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
+
+ memset(dx_leaf, 0, osb->sb->s_blocksize);
+ strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
+ dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
+ dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
+ dx_leaf->dl_list.de_count =
+ cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
+
+ trace_ocfs2_dx_dir_format_cluster(
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(dx_leaf->dl_list.de_count));
+
+ ocfs2_journal_dirty(handle, bh);
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
+ * Allocates and formats a new cluster for use in an indexed dir
+ * leaf. This version will not do the extent insert, so that it can be
+ * used by operations which need careful ordering.
+ */
+static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
+ u32 cpos, handle_t *handle,
+ struct ocfs2_alloc_context *data_ac,
+ struct buffer_head **dx_leaves,
+ int num_dx_leaves, u64 *ret_phys_blkno)
+{
+ int ret;
+ u32 phys, num;
+ u64 phys_blkno;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+ /*
+ * XXX: For create, this should claim cluster for the index
+ * *before* the unindexed insert so that we have a better
+ * chance of contiguousness as the directory grows in number
+ * of entries.
+ */
+ ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Format the new cluster first. That way, we're inserting
+ * valid data.
+ */
+ phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
+ ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
+ num_dx_leaves, phys_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ *ret_phys_blkno = phys_blkno;
+out:
+ return ret;
+}
+
+static int ocfs2_dx_dir_new_cluster(struct inode *dir,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, handle_t *handle,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ struct buffer_head **dx_leaves,
+ int num_dx_leaves)
+{
+ int ret;
+ u64 phys_blkno;
+
+ ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
+ num_dx_leaves, &phys_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0,
+ meta_ac);
+ if (ret)
+ mlog_errno(ret);
+out:
+ return ret;
+}
+
+static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
+ int *ret_num_leaves)
+{
+ int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
+ struct buffer_head **dx_leaves;
+
+ dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
+ GFP_NOFS);
+ if (dx_leaves && ret_num_leaves)
+ *ret_num_leaves = num_dx_leaves;
+
+ return dx_leaves;
+}
+
+static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct inode *parent,
+ struct inode *inode,
+ struct buffer_head *di_bh,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret;
+ struct buffer_head *leaf_bh = NULL;
+ struct buffer_head *dx_root_bh = NULL;
+ struct ocfs2_dx_hinfo hinfo;
+ struct ocfs2_dx_root_block *dx_root;
+ struct ocfs2_dx_entry_list *entry_list;
+
+ /*
+ * Our strategy is to create the directory as though it were
+ * unindexed, then add the index block. This works with very
+ * little complication since the state of a new directory is a
+ * very well known quantity.
+ *
+ * Essentially, we have two dirents ("." and ".."), in the 1st
+ * block which need indexing. These are easily inserted into
+ * the index block.
+ */
+
+ ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
+ data_ac, &leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
+ meta_ac, 1, 2, &dx_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+ entry_list = &dx_root->dr_entries;
+
+ /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
+ ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
+ ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
+
+ ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
+ ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
+
+out:
+ brelse(dx_root_bh);
+ brelse(leaf_bh);
+ return ret;
+}
+
+int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct inode *parent,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac)
+
+{
+ BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
+
+ if (ocfs2_supports_indexed_dirs(osb))
+ return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
+ data_ac, meta_ac);
+
+ return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
+ data_ac, NULL);
+}
+
+static int ocfs2_dx_dir_index_block(struct inode *dir,
+ handle_t *handle,
+ struct buffer_head **dx_leaves,
+ int num_dx_leaves,
+ u32 *num_dx_entries,
+ struct buffer_head *dirent_bh)
+{
+ int ret = 0, namelen, i;
+ char *de_buf, *limit;
+ struct ocfs2_dir_entry *de;
+ struct buffer_head *dx_leaf_bh;
+ struct ocfs2_dx_hinfo hinfo;
+ u64 dirent_blk = dirent_bh->b_blocknr;
+
+ de_buf = dirent_bh->b_data;
+ limit = de_buf + dir->i_sb->s_blocksize;
+
+ while (de_buf < limit) {
+ de = (struct ocfs2_dir_entry *)de_buf;
+
+ namelen = de->name_len;
+ if (!namelen || !de->inode)
+ goto inc;
+
+ ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
+
+ i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
+ dx_leaf_bh = dx_leaves[i];
+
+ ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
+ dirent_blk, dx_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ *num_dx_entries = *num_dx_entries + 1;
+
+inc:
+ de_buf += le16_to_cpu(de->rec_len);
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * XXX: This expects dx_root_bh to already be part of the transaction.
+ */
+static void ocfs2_dx_dir_index_root_block(struct inode *dir,
+ struct buffer_head *dx_root_bh,
+ struct buffer_head *dirent_bh)
+{
+ char *de_buf, *limit;
+ struct ocfs2_dx_root_block *dx_root;
+ struct ocfs2_dir_entry *de;
+ struct ocfs2_dx_hinfo hinfo;
+ u64 dirent_blk = dirent_bh->b_blocknr;
+
+ dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+ de_buf = dirent_bh->b_data;
+ limit = de_buf + dir->i_sb->s_blocksize;
+
+ while (de_buf < limit) {
+ de = (struct ocfs2_dir_entry *)de_buf;
+
+ if (!de->name_len || !de->inode)
+ goto inc;
+
+ ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
+
+ trace_ocfs2_dx_dir_index_root_block(
+ (unsigned long long)dir->i_ino,
+ hinfo.major_hash, hinfo.minor_hash,
+ de->name_len, de->name,
+ le16_to_cpu(dx_root->dr_entries.de_num_used));
+
+ ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
+ dirent_blk);
+
+ le32_add_cpu(&dx_root->dr_num_entries, 1);
+inc:
+ de_buf += le16_to_cpu(de->rec_len);
+ }
+}
+
+/*
+ * Count the number of inline directory entries in di_bh and compare
+ * them against the number of entries we can hold in an inline dx root
+ * block.
+ */
+static int ocfs2_new_dx_should_be_inline(struct inode *dir,
+ struct buffer_head *di_bh)
+{
+ int dirent_count = 0;
+ char *de_buf, *limit;
+ struct ocfs2_dir_entry *de;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ de_buf = di->id2.i_data.id_data;
+ limit = de_buf + i_size_read(dir);
+
+ while (de_buf < limit) {
+ de = (struct ocfs2_dir_entry *)de_buf;
+
+ if (de->name_len && de->inode)
+ dirent_count++;
+
+ de_buf += le16_to_cpu(de->rec_len);
+ }
+
+ /* We are careful to leave room for one extra record. */
+ return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
+}
+
+/*
+ * Expand rec_len of the rightmost dirent in a directory block so that it
+ * contains the end of our valid space for dirents. We do this during
+ * expansion from an inline directory to one with extents. The first dir block
+ * in that case is taken from the inline data portion of the inode block.
+ *
+ * This will also return the largest amount of contiguous space for a dirent
+ * in the block. That value is *not* necessarily the last dirent, even after
+ * expansion. The directory indexing code wants this value for free space
+ * accounting. We do this here since we're already walking the entire dir
+ * block.
+ *
+ * We add the dir trailer if this filesystem wants it.
+ */
+static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
+ struct inode *dir)
+{
+ struct super_block *sb = dir->i_sb;
+ struct ocfs2_dir_entry *de;
+ struct ocfs2_dir_entry *prev_de;
+ char *de_buf, *limit;
+ unsigned int new_size = sb->s_blocksize;
+ unsigned int bytes, this_hole;
+ unsigned int largest_hole = 0;
+
+ if (ocfs2_new_dir_wants_trailer(dir))
+ new_size = ocfs2_dir_trailer_blk_off(sb);
+
+ bytes = new_size - old_size;
+
+ limit = start + old_size;
+ de_buf = start;
+ de = (struct ocfs2_dir_entry *)de_buf;
+ do {
+ this_hole = ocfs2_figure_dirent_hole(de);
+ if (this_hole > largest_hole)
+ largest_hole = this_hole;
+
+ prev_de = de;
+ de_buf += le16_to_cpu(de->rec_len);
+ de = (struct ocfs2_dir_entry *)de_buf;
+ } while (de_buf < limit);
+
+ le16_add_cpu(&prev_de->rec_len, bytes);
+
+ /* We need to double check this after modification of the final
+ * dirent. */
+ this_hole = ocfs2_figure_dirent_hole(prev_de);
+ if (this_hole > largest_hole)
+ largest_hole = this_hole;
+
+ if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+ return largest_hole;
+ return 0;
+}
+
+/*
+ * We allocate enough clusters to fulfill "blocks_wanted", but set
+ * i_size to exactly one block. Ocfs2_extend_dir() will handle the
+ * rest automatically for us.
+ *
+ * *first_block_bh is a pointer to the 1st data block allocated to the
+ * directory.
+ */
+static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
+ unsigned int blocks_wanted,
+ struct ocfs2_dir_lookup_result *lookup,
+ struct buffer_head **first_block_bh)
+{
+ u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
+ struct super_block *sb = dir->i_sb;
+ int ret, i, num_dx_leaves = 0, dx_inline = 0,
+ credits = ocfs2_inline_to_extents_credits(sb);
+ u64 dx_insert_blkno, blkno,
+ bytes = blocks_wanted << sb->s_blocksize_bits;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(dir);
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct buffer_head *dirdata_bh = NULL;
+ struct buffer_head *dx_root_bh = NULL;
+ struct buffer_head **dx_leaves = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ handle_t *handle;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_extent_tree dx_et;
+ int did_quota = 0, bytes_allocated = 0;
+
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh);
+
+ alloc = ocfs2_clusters_for_bytes(sb, bytes);
+ dx_alloc = 0;
+
+ down_write(&oi->ip_alloc_sem);
+
+ if (ocfs2_supports_indexed_dirs(osb)) {
+ credits += ocfs2_add_dir_index_credits(sb);
+
+ dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
+ if (!dx_inline) {
+ /* Add one more cluster for an index leaf */
+ dx_alloc++;
+ dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
+ &num_dx_leaves);
+ if (!dx_leaves) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
}
- de = (struct ocfs2_dir_entry *) bh->b_data;
}
- if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
- brelse(bh);
- return 1;
+
+ /* This gets us the dx_root */
+ ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
- if (le64_to_cpu(de->inode)) {
- brelse(bh);
- return 0;
+ }
+
+ /*
+ * We should never need more than 2 clusters for the unindexed
+ * tree - maximum dirent size is far less than one block. In
+ * fact, the only time we'd need more than one cluster is if
+ * blocksize == clustersize and the dirent won't fit in the
+ * extra space that the expansion to a single block gives. As
+ * of today, that only happens on 4k/4k file systems.
+ */
+ BUG_ON(alloc > 2);
+
+ ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Prepare for worst case allocation scenario of two separate
+ * extents in the unindexed tree.
+ */
+ if (alloc == 2)
+ credits += OCFS2_SUBALLOC_ALLOC;
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
+ if (ret)
+ goto out_commit;
+ did_quota = 1;
+
+ if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+ /*
+ * Allocate our index cluster first, to maximize the
+ * possibility that unindexed leaves grow
+ * contiguously.
+ */
+ ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
+ dx_leaves, num_dx_leaves,
+ &dx_insert_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
}
- offset += le16_to_cpu(de->rec_len);
- de = (struct ocfs2_dir_entry *)
- ((char *)de + le16_to_cpu(de->rec_len));
+ bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
}
- brelse(bh);
- return 1;
+
+ /*
+ * Try to claim as many clusters as the bitmap can give though
+ * if we only get one now, that's enough to continue. The rest
+ * will be claimed after the conversion to extents.
+ */
+ if (ocfs2_dir_resv_allowed(osb))
+ data_ac->ac_resv = &oi->ip_la_data_resv;
+ ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
+
+ /*
+ * Operations are carefully ordered so that we set up the new
+ * data block first. The conversion from inline data to
+ * extents follows.
+ */
+ blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
+ dirdata_bh = sb_getblk(sb, blkno);
+ if (!dirdata_bh) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh);
+
+ ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
+ memset(dirdata_bh->b_data + i_size_read(dir), 0,
+ sb->s_blocksize - i_size_read(dir));
+ i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
+ if (ocfs2_new_dir_wants_trailer(dir)) {
+ /*
+ * Prepare the dir trailer up front. It will otherwise look
+ * like a valid dirent. Even if inserting the index fails
+ * (unlikely), then all we'll have done is given first dir
+ * block a small amount of fragmentation.
+ */
+ ocfs2_init_dir_trailer(dir, dirdata_bh, i);
+ }
+
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
+ ocfs2_journal_dirty(handle, dirdata_bh);
+
+ if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+ /*
+ * Dx dirs with an external cluster need to do this up
+ * front. Inline dx root's get handled later, after
+ * we've allocated our root block. We get passed back
+ * a total number of items so that dr_num_entries can
+ * be correctly set once the dx_root has been
+ * allocated.
+ */
+ ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
+ num_dx_leaves, &num_dx_entries,
+ dirdata_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ }
+
+ /*
+ * Set extent, i_size, etc on the directory. After this, the
+ * inode should contain the same exact dirents as before and
+ * be fully accessible from system calls.
+ *
+ * We let the later dirent insert modify c/mtime - to the user
+ * the data hasn't changed.
+ */
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ spin_unlock(&oi->ip_lock);
+
+ ocfs2_dinode_new_extent_list(dir, di);
+
+ i_size_write(dir, sb->s_blocksize);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+
+ di->i_size = cpu_to_le64(sb->s_blocksize);
+ di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
+
+ /*
+ * This should never fail as our extent list is empty and all
+ * related blocks have been journaled already.
+ */
+ ret = ocfs2_insert_extent(handle, &et, 0, blkno, len,
+ 0, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /*
+ * Set i_blocks after the extent insert for the most up to
+ * date ip_clusters value.
+ */
+ dir->i_blocks = ocfs2_inode_sector_count(dir);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+ if (ocfs2_supports_indexed_dirs(osb)) {
+ ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
+ dirdata_bh, meta_ac, dx_inline,
+ num_dx_entries, &dx_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ if (dx_inline) {
+ ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
+ dirdata_bh);
+ } else {
+ ocfs2_init_dx_root_extent_tree(&dx_et,
+ INODE_CACHE(dir),
+ dx_root_bh);
+ ret = ocfs2_insert_extent(handle, &dx_et, 0,
+ dx_insert_blkno, 1, 0, NULL);
+ if (ret)
+ mlog_errno(ret);
+ }
+ }
+
+ /*
+ * We asked for two clusters, but only got one in the 1st
+ * pass. Claim the 2nd cluster as a separate extent.
+ */
+ if (alloc > len) {
+ ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
+ &len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
+
+ ret = ocfs2_insert_extent(handle, &et, 1,
+ blkno, len, 0, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
+ }
+
+ *first_block_bh = dirdata_bh;
+ dirdata_bh = NULL;
+ if (ocfs2_supports_indexed_dirs(osb)) {
+ unsigned int off;
+
+ if (!dx_inline) {
+ /*
+ * We need to return the correct block within the
+ * cluster which should hold our entry.
+ */
+ off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
+ &lookup->dl_hinfo);
+ get_bh(dx_leaves[off]);
+ lookup->dl_dx_leaf_bh = dx_leaves[off];
+ }
+ lookup->dl_dx_root_bh = dx_root_bh;
+ dx_root_bh = NULL;
+ }
+
+out_commit:
+ if (ret < 0 && did_quota)
+ dquot_free_space_nodirty(dir, bytes_allocated);
+
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ up_write(&oi->ip_alloc_sem);
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ if (dx_leaves) {
+ for (i = 0; i < num_dx_leaves; i++)
+ brelse(dx_leaves[i]);
+ kfree(dx_leaves);
+ }
+
+ brelse(dirdata_bh);
+ brelse(dx_root_bh);
+
+ return ret;
}
/* returns a bh of the 1st new block in the allocation. */
-int ocfs2_do_extend_dir(struct super_block *sb,
- struct ocfs2_journal_handle *handle,
- struct inode *dir,
- struct buffer_head *parent_fe_bh,
- struct ocfs2_alloc_context *data_ac,
- struct ocfs2_alloc_context *meta_ac,
- struct buffer_head **new_bh)
+static int ocfs2_do_extend_dir(struct super_block *sb,
+ handle_t *handle,
+ struct inode *dir,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ struct buffer_head **new_bh)
{
int status;
- int extend;
- u64 p_blkno;
+ int extend, did_quota = 0;
+ u64 p_blkno, v_blkno;
spin_lock(&OCFS2_I(dir)->ip_lock);
extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
spin_unlock(&OCFS2_I(dir)->ip_lock);
if (extend) {
- status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
- parent_fe_bh, handle,
- data_ac, meta_ac, NULL);
+ u32 offset = OCFS2_I(dir)->ip_clusters;
+
+ status = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(sb, 1));
+ if (status)
+ goto bail;
+ did_quota = 1;
+
+ status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
+ 1, 0, parent_fe_bh, handle,
+ data_ac, meta_ac, NULL);
BUG_ON(status == -EAGAIN);
if (status < 0) {
mlog_errno(status);
@@ -366,9 +3152,8 @@ int ocfs2_do_extend_dir(struct super_block *sb,
}
}
- status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
- (sb->s_blocksize_bits - 9)),
- 1, &p_blkno, NULL);
+ v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
+ status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -376,51 +3161,106 @@ int ocfs2_do_extend_dir(struct super_block *sb,
*new_bh = sb_getblk(sb, p_blkno);
if (!*new_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
status = 0;
bail:
- mlog_exit(status);
+ if (did_quota && status < 0)
+ dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
return status;
}
-/* assumes you already have a cluster lock on the directory. */
+/*
+ * Assumes you already have a cluster lock on the directory.
+ *
+ * 'blocks_wanted' is only used if we have an inline directory which
+ * is to be turned into an extent based one. The size of the dirent to
+ * insert might be larger than the space gained by growing to just one
+ * block, so we may have to grow the inode by two blocks in that case.
+ *
+ * If the directory is already indexed, dx_root_bh must be provided.
+ */
static int ocfs2_extend_dir(struct ocfs2_super *osb,
struct inode *dir,
struct buffer_head *parent_fe_bh,
+ unsigned int blocks_wanted,
+ struct ocfs2_dir_lookup_result *lookup,
struct buffer_head **new_de_bh)
{
int status = 0;
- int credits, num_free_extents;
+ int credits, num_free_extents, drop_alloc_sem = 0;
loff_t dir_i_size;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+ struct ocfs2_extent_list *el = &fe->id2.i_list;
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
- struct ocfs2_journal_handle *handle = NULL;
+ handle_t *handle = NULL;
struct buffer_head *new_bh = NULL;
struct ocfs2_dir_entry * de;
struct super_block *sb = osb->sb;
+ struct ocfs2_extent_tree et;
+ struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
- mlog_entry_void();
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ /*
+ * This would be a code error as an inline directory should
+ * never have an index root.
+ */
+ BUG_ON(dx_root_bh);
- dir_i_size = i_size_read(dir);
- mlog(0, "extending dir %llu (i_size = %lld)\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
+ status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
+ blocks_wanted, lookup,
+ &new_bh);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
- handle = ocfs2_alloc_handle(osb);
- if (handle == NULL) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
+ /* Expansion from inline to an indexed directory will
+ * have given us this. */
+ dx_root_bh = lookup->dl_dx_root_bh;
+
+ if (blocks_wanted == 1) {
+ /*
+ * If the new dirent will fit inside the space
+ * created by pushing out to one block, then
+ * we can complete the operation
+ * here. Otherwise we have to expand i_size
+ * and format the 2nd block below.
+ */
+ BUG_ON(new_bh == NULL);
+ goto bail_bh;
+ }
+
+ /*
+ * Get rid of 'new_bh' - we want to format the 2nd
+ * data block and return that instead.
+ */
+ brelse(new_bh);
+ new_bh = NULL;
+
+ down_write(&OCFS2_I(dir)->ip_alloc_sem);
+ drop_alloc_sem = 1;
+ dir_i_size = i_size_read(dir);
+ credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
+ goto do_extend;
}
+ down_write(&OCFS2_I(dir)->ip_alloc_sem);
+ drop_alloc_sem = 1;
+ dir_i_size = i_size_read(dir);
+ trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno,
+ dir_i_size);
+
/* dir->i_size is always block aligned. */
spin_lock(&OCFS2_I(dir)->ip_lock);
if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
spin_unlock(&OCFS2_I(dir)->ip_lock);
- num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
+ parent_fe_bh);
+ num_free_extents = ocfs2_num_free_extents(osb, &et);
if (num_free_extents < 0) {
status = num_free_extents;
mlog_errno(status);
@@ -428,8 +3268,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
}
if (!num_free_extents) {
- status = ocfs2_reserve_new_metadata(osb, handle,
- fe, &meta_ac);
+ status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -437,20 +3276,28 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
}
}
- status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+ status = ocfs2_reserve_clusters(osb, 1, &data_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto bail;
}
- credits = ocfs2_calc_extend_credits(sb, fe, 1);
+ if (ocfs2_dir_resv_allowed(osb))
+ data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
+
+ credits = ocfs2_calc_extend_credits(sb, el);
} else {
spin_unlock(&OCFS2_I(dir)->ip_lock);
credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
}
- handle = ocfs2_start_trans(osb, handle, credits);
+do_extend:
+ if (ocfs2_dir_indexed(dir))
+ credits++; /* For attaching the new dirent block to the
+ * dx_root */
+
+ handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -465,89 +3312,149 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
goto bail;
}
- ocfs2_set_new_buffer_uptodate(dir, new_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh);
- status = ocfs2_journal_access(handle, dir, new_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
memset(new_bh->b_data, 0, sb->s_blocksize);
+
de = (struct ocfs2_dir_entry *) new_bh->b_data;
de->inode = 0;
- de->rec_len = cpu_to_le16(sb->s_blocksize);
- status = ocfs2_journal_dirty(handle, new_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ if (ocfs2_supports_dir_trailer(dir)) {
+ de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
+
+ ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
+
+ if (ocfs2_dir_indexed(dir)) {
+ status = ocfs2_dx_dir_link_trailer(dir, handle,
+ dx_root_bh, new_bh);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+ } else {
+ de->rec_len = cpu_to_le16(sb->s_blocksize);
}
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
+ ocfs2_journal_dirty(handle, new_bh);
dir_i_size += dir->i_sb->s_blocksize;
i_size_write(dir, dir_i_size);
- dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
+ dir->i_blocks = ocfs2_inode_sector_count(dir);
status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+bail_bh:
*new_de_bh = new_bh;
get_bh(*new_de_bh);
bail:
if (handle)
- ocfs2_commit_trans(handle);
+ ocfs2_commit_trans(osb, handle);
+ if (drop_alloc_sem)
+ up_write(&OCFS2_I(dir)->ip_alloc_sem);
if (data_ac)
ocfs2_free_alloc_context(data_ac);
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
- if (new_bh)
- brelse(new_bh);
+ brelse(new_bh);
- mlog_exit(status);
return status;
}
-/*
- * Search the dir for a good spot, extending it if necessary. The
- * block containing an appropriate record is returned in ret_de_bh.
- */
-int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
- struct inode *dir,
- struct buffer_head *parent_fe_bh,
- const char *name,
- int namelen,
- struct buffer_head **ret_de_bh)
+static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
+ const char *name, int namelen,
+ struct buffer_head **ret_de_bh,
+ unsigned int *blocks_wanted)
{
- unsigned long offset;
- struct buffer_head * bh = NULL;
- unsigned short rec_len;
- struct ocfs2_dinode *fe;
- struct ocfs2_dir_entry *de;
- struct super_block *sb;
- int status;
+ int ret;
+ struct super_block *sb = dir->i_sb;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_dir_entry *de, *last_de = NULL;
+ char *de_buf, *limit;
+ unsigned long offset = 0;
+ unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
- mlog_entry_void();
+ /*
+ * This calculates how many free bytes we'd have in block zero, should
+ * this function force expansion to an extent tree.
+ */
+ if (ocfs2_new_dir_wants_trailer(dir))
+ free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
+ else
+ free_space = dir->i_sb->s_blocksize - i_size_read(dir);
- mlog(0, "getting ready to insert namelen %d into dir %llu\n",
- namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
+ de_buf = di->id2.i_data.id_data;
+ limit = de_buf + i_size_read(dir);
+ rec_len = OCFS2_DIR_REC_LEN(namelen);
- BUG_ON(!S_ISDIR(dir->i_mode));
- fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
- BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));
+ while (de_buf < limit) {
+ de = (struct ocfs2_dir_entry *)de_buf;
- sb = dir->i_sb;
+ if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ if (ocfs2_match(namelen, name, de)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ /*
+ * No need to check for a trailing dirent record here as
+ * they're not used for inline dirs.
+ */
- if (!namelen) {
- status = -EINVAL;
- mlog_errno(status);
- goto bail;
+ if (ocfs2_dirent_would_fit(de, rec_len)) {
+ /* Ok, we found a spot. Return this bh and let
+ * the caller actually fill it in. */
+ *ret_de_bh = di_bh;
+ get_bh(*ret_de_bh);
+ ret = 0;
+ goto out;
+ }
+
+ last_de = de;
+ de_buf += le16_to_cpu(de->rec_len);
+ offset += le16_to_cpu(de->rec_len);
}
- bh = ocfs2_bread(dir, 0, &status, 0);
- if (!bh) {
+ /*
+ * We're going to require expansion of the directory - figure
+ * out how many blocks we'll need so that a place for the
+ * dirent can be found.
+ */
+ *blocks_wanted = 1;
+ new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
+ if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
+ *blocks_wanted = 2;
+
+ ret = -ENOSPC;
+out:
+ return ret;
+}
+
+static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
+ int namelen, struct buffer_head **ret_de_bh)
+{
+ unsigned long offset;
+ struct buffer_head *bh = NULL;
+ unsigned short rec_len;
+ struct ocfs2_dir_entry *de;
+ struct super_block *sb = dir->i_sb;
+ int status;
+ int blocksize = dir->i_sb->s_blocksize;
+
+ status = ocfs2_read_dir_block(dir, 0, &bh, 0);
+ if (status) {
mlog_errno(status);
goto bail;
}
@@ -561,24 +3468,17 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
bh = NULL;
if (i_size_read(dir) <= offset) {
- status = ocfs2_extend_dir(osb,
- dir,
- parent_fe_bh,
- &bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- BUG_ON(!bh);
- *ret_de_bh = bh;
- get_bh(*ret_de_bh);
+ /*
+ * Caller will have to expand this
+ * directory.
+ */
+ status = -ENOSPC;
goto bail;
}
- bh = ocfs2_bread(dir,
- offset >> sb->s_blocksize_bits,
- &status,
- 0);
- if (!bh) {
+ status = ocfs2_read_dir_block(dir,
+ offset >> sb->s_blocksize_bits,
+ &bh, 0);
+ if (status) {
mlog_errno(status);
goto bail;
}
@@ -593,10 +3493,12 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
status = -EEXIST;
goto bail;
}
- if (((le64_to_cpu(de->inode) == 0) &&
- (le16_to_cpu(de->rec_len) >= rec_len)) ||
- (le16_to_cpu(de->rec_len) >=
- (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+
+ if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
+ blocksize))
+ goto next;
+
+ if (ocfs2_dirent_would_fit(de, rec_len)) {
/* Ok, we found a spot. Return this bh and let
* the caller actually fill it in. */
*ret_de_bh = bh;
@@ -604,15 +3506,1001 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
status = 0;
goto bail;
}
+next:
offset += le16_to_cpu(de->rec_len);
de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
}
status = 0;
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
+ if (status)
+ mlog_errno(status);
- mlog_exit(status);
return status;
}
+
+static int dx_leaf_sort_cmp(const void *a, const void *b)
+{
+ const struct ocfs2_dx_entry *entry1 = a;
+ const struct ocfs2_dx_entry *entry2 = b;
+ u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
+ u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
+ u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
+ u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
+
+ if (major_hash1 > major_hash2)
+ return 1;
+ if (major_hash1 < major_hash2)
+ return -1;
+
+ /*
+ * It is not strictly necessary to sort by minor
+ */
+ if (minor_hash1 > minor_hash2)
+ return 1;
+ if (minor_hash1 < minor_hash2)
+ return -1;
+ return 0;
+}
+
+static void dx_leaf_sort_swap(void *a, void *b, int size)
+{
+ struct ocfs2_dx_entry *entry1 = a;
+ struct ocfs2_dx_entry *entry2 = b;
+ struct ocfs2_dx_entry tmp;
+
+ BUG_ON(size != sizeof(*entry1));
+
+ tmp = *entry1;
+ *entry1 = *entry2;
+ *entry2 = tmp;
+}
+
+static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
+{
+ struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+ int i, num = le16_to_cpu(dl_list->de_num_used);
+
+ for (i = 0; i < (num - 1); i++) {
+ if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
+ le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Find the optimal value to split this leaf on. This expects the leaf
+ * entries to be in sorted order.
+ *
+ * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
+ * the hash we want to insert.
+ *
+ * This function is only concerned with the major hash - that which
+ * determines which cluster an item belongs to.
+ */
+static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
+ u32 leaf_cpos, u32 insert_hash,
+ u32 *split_hash)
+{
+ struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+ int i, num_used = le16_to_cpu(dl_list->de_num_used);
+ int allsame;
+
+ /*
+ * There's a couple rare, but nasty corner cases we have to
+ * check for here. All of them involve a leaf where all value
+ * have the same hash, which is what we look for first.
+ *
+ * Most of the time, all of the above is false, and we simply
+ * pick the median value for a split.
+ */
+ allsame = ocfs2_dx_leaf_same_major(dx_leaf);
+ if (allsame) {
+ u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
+
+ if (val == insert_hash) {
+ /*
+ * No matter where we would choose to split,
+ * the new entry would want to occupy the same
+ * block as these. Since there's no space left
+ * in their existing block, we know there
+ * won't be space after the split.
+ */
+ return -ENOSPC;
+ }
+
+ if (val == leaf_cpos) {
+ /*
+ * Because val is the same as leaf_cpos (which
+ * is the smallest value this leaf can have),
+ * yet is not equal to insert_hash, then we
+ * know that insert_hash *must* be larger than
+ * val (and leaf_cpos). At least cpos+1 in value.
+ *
+ * We also know then, that there cannot be an
+ * adjacent extent (otherwise we'd be looking
+ * at it). Choosing this value gives us a
+ * chance to get some contiguousness.
+ */
+ *split_hash = leaf_cpos + 1;
+ return 0;
+ }
+
+ if (val > insert_hash) {
+ /*
+ * val can not be the same as insert hash, and
+ * also must be larger than leaf_cpos. Also,
+ * we know that there can't be a leaf between
+ * cpos and val, otherwise the entries with
+ * hash 'val' would be there.
+ */
+ *split_hash = val;
+ return 0;
+ }
+
+ *split_hash = insert_hash;
+ return 0;
+ }
+
+ /*
+ * Since the records are sorted and the checks above
+ * guaranteed that not all records in this block are the same,
+ * we simple travel forward, from the median, and pick the 1st
+ * record whose value is larger than leaf_cpos.
+ */
+ for (i = (num_used / 2); i < num_used; i++)
+ if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
+ leaf_cpos)
+ break;
+
+ BUG_ON(i == num_used); /* Should be impossible */
+ *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
+ return 0;
+}
+
+/*
+ * Transfer all entries in orig_dx_leaves whose major hash is equal to or
+ * larger than split_hash into new_dx_leaves. We use a temporary
+ * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
+ *
+ * Since the block offset inside a leaf (cluster) is a constant mask
+ * of minor_hash, we can optimize - an item at block offset X within
+ * the original cluster, will be at offset X within the new cluster.
+ */
+static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
+ handle_t *handle,
+ struct ocfs2_dx_leaf *tmp_dx_leaf,
+ struct buffer_head **orig_dx_leaves,
+ struct buffer_head **new_dx_leaves,
+ int num_dx_leaves)
+{
+ int i, j, num_used;
+ u32 major_hash;
+ struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
+ struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
+ struct ocfs2_dx_entry *dx_entry;
+
+ tmp_list = &tmp_dx_leaf->dl_list;
+
+ for (i = 0; i < num_dx_leaves; i++) {
+ orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
+ orig_list = &orig_dx_leaf->dl_list;
+ new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
+ new_list = &new_dx_leaf->dl_list;
+
+ num_used = le16_to_cpu(orig_list->de_num_used);
+
+ memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
+ tmp_list->de_num_used = cpu_to_le16(0);
+ memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
+
+ for (j = 0; j < num_used; j++) {
+ dx_entry = &orig_list->de_entries[j];
+ major_hash = le32_to_cpu(dx_entry->dx_major_hash);
+ if (major_hash >= split_hash)
+ ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
+ dx_entry);
+ else
+ ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
+ dx_entry);
+ }
+ memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
+
+ ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
+ ocfs2_journal_dirty(handle, new_dx_leaves[i]);
+ }
+}
+
+static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
+ struct ocfs2_dx_root_block *dx_root)
+{
+ int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
+
+ credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list);
+ credits += ocfs2_quota_trans_credits(osb->sb);
+ return credits;
+}
+
+/*
+ * Find the median value in dx_leaf_bh and allocate a new leaf to move
+ * half our entries into.
+ */
+static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
+ struct buffer_head *dx_root_bh,
+ struct buffer_head *dx_leaf_bh,
+ struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
+ u64 leaf_blkno)
+{
+ struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+ int credits, ret, i, num_used, did_quota = 0;
+ u32 cpos, split_hash, insert_hash = hinfo->major_hash;
+ u64 orig_leaves_start;
+ int num_dx_leaves;
+ struct buffer_head **orig_dx_leaves = NULL;
+ struct buffer_head **new_dx_leaves = NULL;
+ struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
+ struct ocfs2_extent_tree et;
+ handle_t *handle = NULL;
+ struct ocfs2_dx_root_block *dx_root;
+ struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
+
+ trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)leaf_blkno,
+ insert_hash);
+
+ ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
+
+ dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+ /*
+ * XXX: This is a rather large limit. We should use a more
+ * realistic value.
+ */
+ if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
+ return -ENOSPC;
+
+ num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
+ if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
+ mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: "
+ "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)leaf_blkno, num_used);
+ ret = -EIO;
+ goto out;
+ }
+
+ orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
+ if (!orig_dx_leaves) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
+ if (!new_dx_leaves) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
+ if (ret) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(dir->i_sb, 1));
+ if (ret)
+ goto out_commit;
+ did_quota = 1;
+
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /*
+ * This block is changing anyway, so we can sort it in place.
+ */
+ sort(dx_leaf->dl_list.de_entries, num_used,
+ sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
+ dx_leaf_sort_swap);
+
+ ocfs2_journal_dirty(handle, dx_leaf_bh);
+
+ ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
+ &split_hash);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash);
+
+ /*
+ * We have to carefully order operations here. There are items
+ * which want to be in the new cluster before insert, but in
+ * order to put those items in the new cluster, we alter the
+ * old cluster. A failure to insert gets nasty.
+ *
+ * So, start by reserving writes to the old
+ * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
+ * the new cluster for us, before inserting it. The insert
+ * won't happen if there's an error before that. Once the
+ * insert is done then, we can transfer from one leaf into the
+ * other without fear of hitting any error.
+ */
+
+ /*
+ * The leaf transfer wants some scratch space so that we don't
+ * wind up doing a bunch of expensive memmove().
+ */
+ tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
+ if (!tmp_dx_leaf) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
+ ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
+ orig_dx_leaves);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ cpos = split_hash;
+ ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
+ data_ac, meta_ac, new_dx_leaves,
+ num_dx_leaves);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ for (i = 0; i < num_dx_leaves; i++) {
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+ orig_dx_leaves[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+ new_dx_leaves[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ }
+
+ ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
+ orig_dx_leaves, new_dx_leaves, num_dx_leaves);
+
+out_commit:
+ if (ret < 0 && did_quota)
+ dquot_free_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(dir->i_sb, 1));
+
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ if (orig_dx_leaves || new_dx_leaves) {
+ for (i = 0; i < num_dx_leaves; i++) {
+ if (orig_dx_leaves)
+ brelse(orig_dx_leaves[i]);
+ if (new_dx_leaves)
+ brelse(new_dx_leaves[i]);
+ }
+ kfree(orig_dx_leaves);
+ kfree(new_dx_leaves);
+ }
+
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+
+ kfree(tmp_dx_leaf);
+ return ret;
+}
+
+static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
+ struct buffer_head *di_bh,
+ struct buffer_head *dx_root_bh,
+ const char *name, int namelen,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ int ret, rebalanced = 0;
+ struct ocfs2_dx_root_block *dx_root;
+ struct buffer_head *dx_leaf_bh = NULL;
+ struct ocfs2_dx_leaf *dx_leaf;
+ u64 blkno;
+ u32 leaf_cpos;
+
+ dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+restart_search:
+ ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
+ &leaf_cpos, &blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+
+ if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
+ le16_to_cpu(dx_leaf->dl_list.de_count)) {
+ if (rebalanced) {
+ /*
+ * Rebalancing should have provided us with
+ * space in an appropriate leaf.
+ *
+ * XXX: Is this an abnormal condition then?
+ * Should we print a message here?
+ */
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
+ &lookup->dl_hinfo, leaf_cpos,
+ blkno);
+ if (ret) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Restart the lookup. The rebalance might have
+ * changed which block our item fits into. Mark our
+ * progress, so we only execute this once.
+ */
+ brelse(dx_leaf_bh);
+ dx_leaf_bh = NULL;
+ rebalanced = 1;
+ goto restart_search;
+ }
+
+ lookup->dl_dx_leaf_bh = dx_leaf_bh;
+ dx_leaf_bh = NULL;
+
+out:
+ brelse(dx_leaf_bh);
+ return ret;
+}
+
+static int ocfs2_search_dx_free_list(struct inode *dir,
+ struct buffer_head *dx_root_bh,
+ int namelen,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ int ret = -ENOSPC;
+ struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
+ struct ocfs2_dir_block_trailer *db;
+ u64 next_block;
+ int rec_len = OCFS2_DIR_REC_LEN(namelen);
+ struct ocfs2_dx_root_block *dx_root;
+
+ dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+ next_block = le64_to_cpu(dx_root->dr_free_blk);
+
+ while (next_block) {
+ brelse(prev_leaf_bh);
+ prev_leaf_bh = leaf_bh;
+ leaf_bh = NULL;
+
+ ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+ if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
+ lookup->dl_leaf_bh = leaf_bh;
+ lookup->dl_prev_leaf_bh = prev_leaf_bh;
+ leaf_bh = NULL;
+ prev_leaf_bh = NULL;
+ break;
+ }
+
+ next_block = le64_to_cpu(db->db_free_next);
+ }
+
+ if (!next_block)
+ ret = -ENOSPC;
+
+out:
+
+ brelse(leaf_bh);
+ brelse(prev_leaf_bh);
+ return ret;
+}
+
+static int ocfs2_expand_inline_dx_root(struct inode *dir,
+ struct buffer_head *dx_root_bh)
+{
+ int ret, num_dx_leaves, i, j, did_quota = 0;
+ struct buffer_head **dx_leaves = NULL;
+ struct ocfs2_extent_tree et;
+ u64 insert_blkno;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ handle_t *handle = NULL;
+ struct ocfs2_dx_root_block *dx_root;
+ struct ocfs2_dx_entry_list *entry_list;
+ struct ocfs2_dx_entry *dx_entry;
+ struct ocfs2_dx_leaf *target_leaf;
+
+ ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
+ if (!dx_leaves) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (ret)
+ goto out_commit;
+ did_quota = 1;
+
+ /*
+ * We do this up front, before the allocation, so that a
+ * failure to add the dx_root_bh to the journal won't result
+ * us losing clusters.
+ */
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
+ num_dx_leaves, &insert_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /*
+ * Transfer the entries from our dx_root into the appropriate
+ * block
+ */
+ dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+ entry_list = &dx_root->dr_entries;
+
+ for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
+ dx_entry = &entry_list->de_entries[i];
+
+ j = __ocfs2_dx_dir_hash_idx(osb,
+ le32_to_cpu(dx_entry->dx_minor_hash));
+ target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
+
+ ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
+
+ /* Each leaf has been passed to the journal already
+ * via __ocfs2_dx_dir_new_cluster() */
+ }
+
+ dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
+ memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
+ offsetof(struct ocfs2_dx_root_block, dr_list));
+ dx_root->dr_list.l_count =
+ cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
+
+ /* This should never fail considering we start with an empty
+ * dx_root. */
+ ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
+ ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL);
+ if (ret)
+ mlog_errno(ret);
+ did_quota = 0;
+
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
+ ocfs2_journal_dirty(handle, dx_root_bh);
+
+out_commit:
+ if (ret < 0 && did_quota)
+ dquot_free_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(dir->i_sb, 1));
+
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+
+ if (dx_leaves) {
+ for (i = 0; i < num_dx_leaves; i++)
+ brelse(dx_leaves[i]);
+ kfree(dx_leaves);
+ }
+ return ret;
+}
+
+static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
+{
+ struct ocfs2_dx_root_block *dx_root;
+ struct ocfs2_dx_entry_list *entry_list;
+
+ dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+ entry_list = &dx_root->dr_entries;
+
+ if (le16_to_cpu(entry_list->de_num_used) >=
+ le16_to_cpu(entry_list->de_count))
+ return -ENOSPC;
+
+ return 0;
+}
+
+static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
+ struct buffer_head *di_bh,
+ const char *name,
+ int namelen,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ int ret, free_dx_root = 1;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct buffer_head *dx_root_bh = NULL;
+ struct buffer_head *leaf_bh = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_dx_root_block *dx_root;
+
+ ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+ if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
+ ret = -ENOSPC;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (ocfs2_dx_root_inline(dx_root)) {
+ ret = ocfs2_inline_dx_has_space(dx_root_bh);
+
+ if (ret == 0)
+ goto search_el;
+
+ /*
+ * We ran out of room in the root block. Expand it to
+ * an extent, then allow ocfs2_find_dir_space_dx to do
+ * the rest.
+ */
+ ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /*
+ * Insert preparation for an indexed directory is split into two
+ * steps. The call to find_dir_space_dx reserves room in the index for
+ * an additional item. If we run out of space there, it's a real error
+ * we can't continue on.
+ */
+ ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
+ namelen, lookup);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+search_el:
+ /*
+ * Next, we need to find space in the unindexed tree. This call
+ * searches using the free space linked list. If the unindexed tree
+ * lacks sufficient space, we'll expand it below. The expansion code
+ * is smart enough to add any new blocks to the free space list.
+ */
+ ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
+ if (ret && ret != -ENOSPC) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Do this up here - ocfs2_extend_dir might need the dx_root */
+ lookup->dl_dx_root_bh = dx_root_bh;
+ free_dx_root = 0;
+
+ if (ret == -ENOSPC) {
+ ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
+
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We make the assumption here that new leaf blocks are added
+ * to the front of our free list.
+ */
+ lookup->dl_prev_leaf_bh = NULL;
+ lookup->dl_leaf_bh = leaf_bh;
+ }
+
+out:
+ if (free_dx_root)
+ brelse(dx_root_bh);
+ return ret;
+}
+
+/*
+ * Get a directory ready for insert. Any directory allocation required
+ * happens here. Success returns zero, and enough context in the dir
+ * lookup result that ocfs2_add_entry() will be able complete the task
+ * with minimal performance impact.
+ */
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+ struct inode *dir,
+ struct buffer_head *parent_fe_bh,
+ const char *name,
+ int namelen,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ int ret;
+ unsigned int blocks_wanted = 1;
+ struct buffer_head *bh = NULL;
+
+ trace_ocfs2_prepare_dir_for_insert(
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
+
+ if (!namelen) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Do this up front to reduce confusion.
+ *
+ * The directory might start inline, then be turned into an
+ * indexed one, in which case we'd need to hash deep inside
+ * ocfs2_find_dir_space_id(). Since
+ * ocfs2_prepare_dx_dir_for_insert() also needs this hash
+ * done, there seems no point in spreading out the calls. We
+ * can optimize away the case where the file system doesn't
+ * support indexing.
+ */
+ if (ocfs2_supports_indexed_dirs(osb))
+ ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
+
+ if (ocfs2_dir_indexed(dir)) {
+ ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
+ name, namelen, lookup);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
+ namelen, &bh, &blocks_wanted);
+ } else
+ ret = ocfs2_find_dir_space_el(dir, name, namelen, &bh);
+
+ if (ret && ret != -ENOSPC) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (ret == -ENOSPC) {
+ /*
+ * We have to expand the directory to add this name.
+ */
+ BUG_ON(bh);
+
+ ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
+ lookup, &bh);
+ if (ret) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(!bh);
+ }
+
+ lookup->dl_leaf_bh = bh;
+ bh = NULL;
+out:
+ brelse(bh);
+ return ret;
+}
+
+static int ocfs2_dx_dir_remove_index(struct inode *dir,
+ struct buffer_head *di_bh,
+ struct buffer_head *dx_root_bh)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_dx_root_block *dx_root;
+ struct inode *dx_alloc_inode = NULL;
+ struct buffer_head *dx_alloc_bh = NULL;
+ handle_t *handle;
+ u64 blk;
+ u16 bit;
+ u64 bg_blkno;
+
+ dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+
+ dx_alloc_inode = ocfs2_get_system_file_inode(osb,
+ EXTENT_ALLOC_SYSTEM_INODE,
+ le16_to_cpu(dx_root->dr_suballoc_slot));
+ if (!dx_alloc_inode) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ mutex_lock(&dx_alloc_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ spin_lock(&OCFS2_I(dir)->ip_lock);
+ OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
+ di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+ spin_unlock(&OCFS2_I(dir)->ip_lock);
+ di->i_dx_root = cpu_to_le64(0ULL);
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+ blk = le64_to_cpu(dx_root->dr_blkno);
+ bit = le16_to_cpu(dx_root->dr_suballoc_bit);
+ if (dx_root->dr_suballoc_loc)
+ bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
+ else
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+ ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
+ bit, bg_blkno, 1);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+ ocfs2_inode_unlock(dx_alloc_inode, 1);
+
+out_mutex:
+ mutex_unlock(&dx_alloc_inode->i_mutex);
+ brelse(dx_alloc_bh);
+out:
+ iput(dx_alloc_inode);
+ return ret;
+}
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
+{
+ int ret;
+ unsigned int uninitialized_var(clen);
+ u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
+ u64 uninitialized_var(blkno);
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct buffer_head *dx_root_bh = NULL;
+ struct ocfs2_dx_root_block *dx_root;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ if (!ocfs2_dir_indexed(dir))
+ return 0;
+
+ ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+ if (ocfs2_dx_root_inline(dx_root))
+ goto remove_index;
+
+ ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
+
+ /* XXX: What if dr_clusters is too large? */
+ while (le32_to_cpu(dx_root->dr_clusters)) {
+ ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
+ major_hash, &cpos, &blkno, &clen);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
+
+ ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
+ &dealloc, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (cpos == 0)
+ break;
+
+ major_hash = cpos - 1;
+ }
+
+remove_index:
+ ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh);
+out:
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+
+ brelse(dx_root_bh);
+ return ret;
+}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index 5f614ec9649..f0344b75b14 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,29 +26,91 @@
#ifndef OCFS2_DIR_H
#define OCFS2_DIR_H
+struct ocfs2_dx_hinfo {
+ u32 major_hash;
+ u32 minor_hash;
+};
+
+struct ocfs2_dir_lookup_result {
+ struct buffer_head *dl_leaf_bh; /* Unindexed leaf
+ * block */
+ struct ocfs2_dir_entry *dl_entry; /* Target dirent in
+ * unindexed leaf */
+
+ struct buffer_head *dl_dx_root_bh; /* Root of indexed
+ * tree */
+
+ struct buffer_head *dl_dx_leaf_bh; /* Indexed leaf block */
+ struct ocfs2_dx_entry *dl_dx_entry; /* Target dx_entry in
+ * indexed leaf */
+ struct ocfs2_dx_hinfo dl_hinfo; /* Name hash results */
+
+ struct buffer_head *dl_prev_leaf_bh;/* Previous entry in
+ * dir free space
+ * list. NULL if
+ * previous entry is
+ * dx root block. */
+};
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
+
+int ocfs2_find_entry(const char *name, int namelen,
+ struct inode *dir,
+ struct ocfs2_dir_lookup_result *lookup);
+int ocfs2_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_dir_lookup_result *res);
+int __ocfs2_add_entry(handle_t *handle,
+ struct inode *dir,
+ const char *name, int namelen,
+ struct inode *inode, u64 blkno,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_dir_lookup_result *lookup);
+static inline int ocfs2_add_entry(handle_t *handle,
+ struct dentry *dentry,
+ struct inode *inode, u64 blkno,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
+ dentry->d_name.name, dentry->d_name.len,
+ inode, blkno, parent_fe_bh, lookup);
+}
+int ocfs2_update_entry(struct inode *dir, handle_t *handle,
+ struct ocfs2_dir_lookup_result *res,
+ struct inode *new_entry_inode);
+
int ocfs2_check_dir_for_entry(struct inode *dir,
const char *name,
int namelen);
-int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */
+int ocfs2_empty_dir(struct inode *inode);
+
int ocfs2_find_files_on_disk(const char *name,
int namelen,
u64 *blkno,
struct inode *inode,
- struct buffer_head **dirent_bh,
- struct ocfs2_dir_entry **dirent);
-int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
+ struct ocfs2_dir_lookup_result *res);
+int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
+ int namelen, u64 *blkno);
+int ocfs2_readdir(struct file *file, struct dir_context *ctx);
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx);
int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
struct inode *dir,
struct buffer_head *parent_fe_bh,
const char *name,
int namelen,
- struct buffer_head **ret_de_bh);
+ struct ocfs2_dir_lookup_result *lookup);
struct ocfs2_alloc_context;
-int ocfs2_do_extend_dir(struct super_block *sb,
- struct ocfs2_journal_handle *handle,
- struct inode *dir,
- struct buffer_head *parent_fe_bh,
- struct ocfs2_alloc_context *data_ac,
- struct ocfs2_alloc_context *meta_ac,
- struct buffer_head **new_bh);
+int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct inode *parent,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac);
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);
+
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+ void *data);
#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index ce3f7c29d27..bd1aab1f49a 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,8 +1,7 @@
-EXTRA_CFLAGS += -Ifs/ocfs2
+ccflags-y := -Ifs/ocfs2
-obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
- dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
+ dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o
-ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
index cfd5cb65cab..3cfa114aa39 100644
--- a/fs/ocfs2/dlm/dlmapi.h
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -95,7 +95,7 @@ const char *dlm_errname(enum dlm_status err);
mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \
} while (0)
-#define DLM_LKSB_UNUSED1 0x01
+#define DLM_LKSB_UNUSED1 0x01
#define DLM_LKSB_PUT_LVB 0x02
#define DLM_LKSB_GET_LVB 0x04
#define DLM_LKSB_UNUSED2 0x08
@@ -193,7 +193,12 @@ enum dlm_status dlmunlock(struct dlm_ctxt *dlm,
dlm_astunlockfunc_t *unlockast,
void *data);
-struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key);
+struct dlm_protocol_version {
+ u8 pv_major;
+ u8 pv_minor;
+};
+struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key,
+ struct dlm_protocol_version *fs_proto);
void dlm_unregister_domain(struct dlm_ctxt *dlm);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 681046d5139..b46278f9ae4 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,9 +28,7 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -43,7 +41,6 @@
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
-#include "cluster/endian.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -91,22 +88,31 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
return 0;
}
-static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
+ struct dlm_lock_resource *res;
BUG_ON(!dlm);
BUG_ON(!lock);
+ res = lock->lockres;
+
assert_spin_locked(&dlm->ast_lock);
+
if (!list_empty(&lock->ast_list)) {
- mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n",
+ mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
+ "AST list not empty, pending %d, newlevel %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
lock->ast_pending, lock->ml.type);
BUG();
}
- BUG_ON(!list_empty(&lock->ast_list));
if (lock->ast_pending)
- mlog(0, "lock has an ast getting flushed right now\n");
+ mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
/* putting lock on list, add a ref */
dlm_lock_get(lock);
@@ -114,9 +120,10 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
/* check to see if this ast obsoletes the bast */
if (dlm_should_cancel_bast(dlm, lock)) {
- struct dlm_lock_resource *res = lock->lockres;
- mlog(0, "%s: cancelling bast for %.*s\n",
- dlm->name, res->lockname.len, res->lockname.name);
+ mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lock->bast_pending = 0;
list_del_init(&lock->bast_list);
lock->ml.highest_blocked = LKM_IVMODE;
@@ -126,7 +133,7 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
dlm_lock_put(lock);
/* free up the reserved bast that we are cancelling.
* guaranteed that this will not be the last reserved
- * ast because *both* an ast and a bast were reserved
+ * ast because *both* an ast and a bast were reserved
* to get to this point. the res->spinlock will not be
* taken here */
dlm_lockres_release_ast(dlm, res);
@@ -138,8 +145,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
-
BUG_ON(!dlm);
BUG_ON(!lock);
@@ -149,17 +154,23 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
}
-static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
+ struct dlm_lock_resource *res;
BUG_ON(!dlm);
BUG_ON(!lock);
+
assert_spin_locked(&dlm->ast_lock);
+ res = lock->lockres;
+
BUG_ON(!list_empty(&lock->bast_list));
if (lock->bast_pending)
- mlog(0, "lock has a bast getting flushed right now\n");
+ mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
/* putting lock on list, add a ref */
dlm_lock_get(lock);
@@ -171,8 +182,6 @@ static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
-
BUG_ON(!dlm);
BUG_ON(!lock);
@@ -188,9 +197,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
BUG_ON(!lksb);
/* only updates if this node masters the lockres */
+ spin_lock(&res->spinlock);
if (res->owner == dlm->node_num) {
-
- spin_lock(&res->spinlock);
/* check the lksb flags for the direction */
if (lksb->flags & DLM_LKSB_GET_LVB) {
mlog(0, "getting lvb from lockres for %s node\n",
@@ -205,8 +213,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
* here. In the future we might want to clear it at the time
* the put is actually done.
*/
- spin_unlock(&res->spinlock);
}
+ spin_unlock(&res->spinlock);
/* reset any lvb flags on the lksb */
lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
@@ -218,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
dlm_astlockfunc_t *fn;
struct dlm_lockstatus *lksb;
- mlog_entry_void();
+ mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lksb = lock->lksb;
fn = lock->ast;
@@ -236,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lockstatus *lksb;
int lksbflags;
- mlog_entry_void();
+ mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lksb = lock->lksb;
BUG_ON(lock->ml.node == dlm->node_num);
@@ -255,15 +269,21 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
{
dlm_bastlockfunc_t *fn = lock->bast;
- mlog_entry_void();
BUG_ON(lock->ml.node != dlm->node_num);
+ mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ blocked_type);
+
(*fn)(lock->astdata, blocked_type);
}
-int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
int ret;
unsigned int locklen;
@@ -272,9 +292,10 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
struct dlm_lock *lock = NULL;
struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
char *name;
- struct list_head *iter, *head=NULL;
- u64 cookie;
+ struct list_head *head = NULL;
+ __be64 cookie;
u32 flags;
+ u8 node;
if (!dlm_grab(dlm)) {
dlm_error(DLM_REJECTED);
@@ -286,18 +307,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
name = past->name;
locklen = past->namelen;
- cookie = be64_to_cpu(past->cookie);
+ cookie = past->cookie;
flags = be32_to_cpu(past->flags);
+ node = past->node_idx;
if (locklen > DLM_LOCKID_NAME_MAX) {
ret = DLM_IVBUFLEN;
- mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
+ mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
+ "handler!\n", locklen);
goto leave;
}
if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
(LKM_PUT_LVB|LKM_GET_LVB)) {
- mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+ mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
+ flags);
ret = DLM_BADARGS;
goto leave;
}
@@ -310,22 +334,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
if (past->type != DLM_AST &&
past->type != DLM_BAST) {
mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
- "name=%.*s\n", past->type,
- dlm_get_lock_cookie_node(cookie),
- dlm_get_lock_cookie_seq(cookie),
- locklen, name);
+ "name=%.*s, node=%u\n", past->type,
+ dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+ locklen, name, node);
ret = DLM_IVLOCKID;
goto leave;
}
res = dlm_lookup_lockres(dlm, name, locklen);
if (!res) {
- mlog(0, "got %sast for unknown lockres! "
- "cookie=%u:%llu, name=%.*s, namelen=%u\n",
- past->type == DLM_AST ? "" : "b",
- dlm_get_lock_cookie_node(cookie),
- dlm_get_lock_cookie_seq(cookie),
- locklen, name, locklen);
+ mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
+ "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
+ dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+ locklen, name, node);
ret = DLM_IVLOCKID;
goto leave;
}
@@ -333,25 +356,25 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
/* cannot get a proxy ast message if this node owns it */
BUG_ON(res->owner == dlm->node_num);
- mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
+ mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
- mlog(0, "responding with DLM_RECOVERING!\n");
+ mlog(0, "Responding with DLM_RECOVERING!\n");
ret = DLM_RECOVERING;
goto unlock_out;
}
if (res->state & DLM_LOCK_RES_MIGRATING) {
- mlog(0, "responding with DLM_MIGRATING!\n");
+ mlog(0, "Responding with DLM_MIGRATING!\n");
ret = DLM_MIGRATING;
goto unlock_out;
}
/* try convert queue for both ast/bast */
head = &res->converting;
lock = NULL;
- list_for_each(iter, head) {
- lock = list_entry (iter, struct dlm_lock, list);
- if (be64_to_cpu(lock->ml.cookie) == cookie)
+ list_for_each_entry(lock, head, list) {
+ if (lock->ml.cookie == cookie)
goto do_ast;
}
@@ -361,16 +384,16 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
else
head = &res->granted;
- list_for_each(iter, head) {
- lock = list_entry (iter, struct dlm_lock, list);
- if (be64_to_cpu(lock->ml.cookie) == cookie)
+ list_for_each_entry(lock, head, list) {
+ if (lock->ml.cookie == cookie)
goto do_ast;
}
- mlog(0, "got %sast for unknown lock! cookie=%u:%llu, "
- "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b",
- dlm_get_lock_cookie_node(cookie), dlm_get_lock_cookie_seq(cookie),
- locklen, name, locklen);
+ mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
+ "node=%u\n", past->type == DLM_AST ? "" : "b",
+ dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+ locklen, name, node);
ret = DLM_NORMAL;
unlock_out:
@@ -382,8 +405,12 @@ do_ast:
if (past->type == DLM_AST) {
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->granted);
- mlog(0, "ast: adding to granted list... type=%d, "
- "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+ mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+ lock->ml.type, lock->ml.convert_type);
+
if (lock->ml.convert_type != LKM_IVMODE) {
lock->ml.type = lock->ml.convert_type;
lock->ml.convert_type = LKM_IVMODE;
@@ -407,7 +434,6 @@ do_ast:
dlm_do_local_bast(dlm, res, lock, past->blocked_type);
leave:
-
if (res)
dlm_lockres_put(res);
@@ -427,9 +453,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
size_t veclen = 1;
int status;
- mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
- res->lockname.len, res->lockname.name, lock->ml.node,
- msg_type, blocked_type);
+ mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
+ res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
+ blocked_type);
memset(&past, 0, sizeof(struct dlm_proxy_ast));
past.node_idx = dlm->node_num;
@@ -442,7 +468,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
vec[0].iov_len = sizeof(struct dlm_proxy_ast);
vec[0].iov_base = &past;
if (flags & DLM_LKSB_GET_LVB) {
- mlog(0, "returning requested LVB data\n");
be32_add_cpu(&past.flags, LKM_GET_LVB);
vec[1].iov_len = DLM_LVB_LEN;
vec[1].iov_base = lock->lksb->lvb;
@@ -452,7 +477,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
lock->ml.node, &status);
if (ret < 0)
- mlog_errno(ret);
+ mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
+ dlm->name, res->lockname.len, res->lockname.name, ret,
+ lock->ml.node);
else {
if (status == DLM_RECOVERING) {
mlog(ML_ERROR, "sent AST to node %u, it thinks this "
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index fa968180b07..fae17c640df 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,7 @@
#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
#define DLM_THREAD_MS 200 // flush at least every 200 ms
-#define DLM_HASH_SIZE_DEFAULT (1 << 14)
+#define DLM_HASH_SIZE_DEFAULT (1 << 17)
#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
# define DLM_HASH_PAGES 1
#else
@@ -49,10 +49,41 @@
/* Intended to make it easier for us to switch out hash functions */
#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
+enum dlm_mle_type {
+ DLM_MLE_BLOCK = 0,
+ DLM_MLE_MASTER = 1,
+ DLM_MLE_MIGRATION = 2,
+ DLM_MLE_NUM_TYPES = 3,
+};
+
+struct dlm_master_list_entry {
+ struct hlist_node master_hash_node;
+ struct list_head hb_events;
+ struct dlm_ctxt *dlm;
+ spinlock_t spinlock;
+ wait_queue_head_t wq;
+ atomic_t woken;
+ struct kref mle_refs;
+ int inuse;
+ unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ u8 master;
+ u8 new_master;
+ enum dlm_mle_type type;
+ struct o2hb_callback_func mle_hb_up;
+ struct o2hb_callback_func mle_hb_down;
+ struct dlm_lock_resource *mleres;
+ unsigned char mname[DLM_LOCKID_NAME_MAX];
+ unsigned int mnamelen;
+ unsigned int mnamehash;
+};
+
enum dlm_ast_type {
DLM_AST = 0,
- DLM_BAST,
- DLM_ASTUNLOCK
+ DLM_BAST = 1,
+ DLM_ASTUNLOCK = 2,
};
@@ -77,7 +108,6 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
struct dlm_recovery_ctxt
{
struct list_head resources;
- struct list_head received;
struct list_head node_data;
u8 new_master;
u8 dead_node;
@@ -88,9 +118,9 @@ struct dlm_recovery_ctxt
enum dlm_ctxt_state {
DLM_CTXT_NEW = 0,
- DLM_CTXT_JOINED,
- DLM_CTXT_IN_SHUTDOWN,
- DLM_CTXT_LEAVING,
+ DLM_CTXT_JOINED = 1,
+ DLM_CTXT_IN_SHUTDOWN = 2,
+ DLM_CTXT_LEAVING = 3,
};
struct dlm_ctxt
@@ -101,9 +131,11 @@ struct dlm_ctxt
struct list_head purge_list;
struct list_head pending_asts;
struct list_head pending_basts;
+ struct list_head tracking_list;
unsigned int purge_count;
spinlock_t spinlock;
spinlock_t ast_lock;
+ spinlock_t track_lock;
char *name;
u8 node_num;
u32 key;
@@ -111,16 +143,21 @@ struct dlm_ctxt
wait_queue_head_t dlm_join_events;
unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct dlm_recovery_ctxt reco;
spinlock_t master_lock;
- struct list_head master_list;
+ struct hlist_head **master_hash;
struct list_head mle_hb_events;
/* these give a really vague idea of the system load */
- atomic_t local_resources;
- atomic_t remote_resources;
- atomic_t unknown_resources;
+ atomic_t mle_tot_count[DLM_MLE_NUM_TYPES];
+ atomic_t mle_cur_count[DLM_MLE_NUM_TYPES];
+ atomic_t res_tot_count;
+ atomic_t res_cur_count;
+
+ struct dlm_debug_ctxt *dlm_debug_ctxt;
+ struct dentry *dlm_debugfs_subroot;
/* NOTE: Next three are protected by dlm_domain_lock */
struct kref dlm_refs;
@@ -142,6 +179,12 @@ struct dlm_ctxt
spinlock_t work_lock;
struct list_head dlm_domain_handlers;
struct list_head dlm_eviction_callbacks;
+
+ /* The filesystem specifies this at domain registration. We
+ * cache it here to know what to tell other nodes. */
+ struct dlm_protocol_version fs_locking_proto;
+ /* This is the inter-dlm communication version */
+ struct dlm_protocol_version dlm_locking_proto;
};
static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i)
@@ -149,11 +192,18 @@ static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned
return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
}
+static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm,
+ unsigned i)
+{
+ return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] +
+ (i % DLM_BUCKETS_PER_PAGE);
+}
+
/* these keventd work queue items are for less-frequently
* called functions that cannot be directly called from the
* net message handlers for some reason, usually because
* they need to send net messages of their own. */
-void dlm_dispatch_work(void *data);
+void dlm_dispatch_work(struct work_struct *work);
struct dlm_lock_resource;
struct dlm_work_item;
@@ -170,6 +220,7 @@ struct dlm_mig_lockres_priv
{
struct dlm_lock_resource *lockres;
u8 real_master;
+ u8 extra_ref;
};
struct dlm_assert_master_priv
@@ -180,6 +231,11 @@ struct dlm_assert_master_priv
unsigned ignore_higher:1;
};
+struct dlm_deref_lockres_priv
+{
+ struct dlm_lock_resource *deref_res;
+ u8 deref_node;
+};
struct dlm_work_item
{
@@ -191,6 +247,7 @@ struct dlm_work_item
struct dlm_request_all_locks_priv ral;
struct dlm_mig_lockres_priv ml;
struct dlm_assert_master_priv am;
+ struct dlm_deref_lockres_priv dl;
} u;
};
@@ -222,6 +279,9 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
#define DLM_LOCK_RES_DIRTY 0x00000008
#define DLM_LOCK_RES_IN_PROGRESS 0x00000010
#define DLM_LOCK_RES_MIGRATING 0x00000020
+#define DLM_LOCK_RES_DROPPING_REF 0x00000040
+#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000
+#define DLM_LOCK_RES_SETREF_INPROG 0x00002000
/* max milliseconds to wait to sync up a network failure with a node death */
#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
@@ -254,10 +314,15 @@ struct dlm_lock_resource
struct list_head dirty;
struct list_head recovering; // dlm_recovery_ctxt.resources list
+ /* Added during init and removed during release */
+ struct list_head tracking; /* dlm->tracking_list */
+
/* unused lock resources have their last_used stamped and are
* put on a list for the dlm thread to run. */
unsigned long last_used;
+ struct dlm_ctxt *dlm;
+
unsigned migration_pending:1;
atomic_t asts_reserved;
spinlock_t spinlock;
@@ -265,6 +330,9 @@ struct dlm_lock_resource
u8 owner; //node which owns the lock resource, or unknown
u16 state;
char lvb[DLM_LVB_LEN];
+ unsigned int inflight_locks;
+ unsigned int inflight_assert_workers;
+ unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
};
struct dlm_migratable_lock
@@ -321,8 +389,8 @@ struct dlm_lock
enum dlm_lockres_list {
DLM_GRANTED_LIST = 0,
- DLM_CONVERTING_LIST,
- DLM_BLOCKED_LIST
+ DLM_CONVERTING_LIST = 1,
+ DLM_BLOCKED_LIST = 2,
};
static inline int dlm_lvb_is_empty(char *lvb)
@@ -334,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
return 1;
}
+static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
+{
+ if (idx == DLM_GRANTED_LIST)
+ return "granted";
+ else if (idx == DLM_CONVERTING_LIST)
+ return "converting";
+ else if (idx == DLM_BLOCKED_LIST)
+ return "blocked";
+ else
+ return "unknown";
+}
+
static inline struct list_head *
dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
{
@@ -360,25 +440,28 @@ struct dlm_node_iter
enum {
- DLM_MASTER_REQUEST_MSG = 500,
- DLM_UNUSED_MSG1, /* 501 */
- DLM_ASSERT_MASTER_MSG, /* 502 */
- DLM_CREATE_LOCK_MSG, /* 503 */
- DLM_CONVERT_LOCK_MSG, /* 504 */
- DLM_PROXY_AST_MSG, /* 505 */
- DLM_UNLOCK_LOCK_MSG, /* 506 */
- DLM_UNUSED_MSG2, /* 507 */
- DLM_MIGRATE_REQUEST_MSG, /* 508 */
- DLM_MIG_LOCKRES_MSG, /* 509 */
- DLM_QUERY_JOIN_MSG, /* 510 */
- DLM_ASSERT_JOINED_MSG, /* 511 */
- DLM_CANCEL_JOIN_MSG, /* 512 */
- DLM_EXIT_DOMAIN_MSG, /* 513 */
- DLM_MASTER_REQUERY_MSG, /* 514 */
- DLM_LOCK_REQUEST_MSG, /* 515 */
- DLM_RECO_DATA_DONE_MSG, /* 516 */
- DLM_BEGIN_RECO_MSG, /* 517 */
- DLM_FINALIZE_RECO_MSG /* 518 */
+ DLM_MASTER_REQUEST_MSG = 500,
+ DLM_UNUSED_MSG1 = 501,
+ DLM_ASSERT_MASTER_MSG = 502,
+ DLM_CREATE_LOCK_MSG = 503,
+ DLM_CONVERT_LOCK_MSG = 504,
+ DLM_PROXY_AST_MSG = 505,
+ DLM_UNLOCK_LOCK_MSG = 506,
+ DLM_DEREF_LOCKRES_MSG = 507,
+ DLM_MIGRATE_REQUEST_MSG = 508,
+ DLM_MIG_LOCKRES_MSG = 509,
+ DLM_QUERY_JOIN_MSG = 510,
+ DLM_ASSERT_JOINED_MSG = 511,
+ DLM_CANCEL_JOIN_MSG = 512,
+ DLM_EXIT_DOMAIN_MSG = 513,
+ DLM_MASTER_REQUERY_MSG = 514,
+ DLM_LOCK_REQUEST_MSG = 515,
+ DLM_RECO_DATA_DONE_MSG = 516,
+ DLM_BEGIN_RECO_MSG = 517,
+ DLM_FINALIZE_RECO_MSG = 518,
+ DLM_QUERY_REGION = 519,
+ DLM_QUERY_NODEINFO = 520,
+ DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
};
struct dlm_reco_node_data
@@ -391,19 +474,19 @@ struct dlm_reco_node_data
enum {
DLM_RECO_NODE_DATA_DEAD = -1,
DLM_RECO_NODE_DATA_INIT = 0,
- DLM_RECO_NODE_DATA_REQUESTING,
- DLM_RECO_NODE_DATA_REQUESTED,
- DLM_RECO_NODE_DATA_RECEIVING,
- DLM_RECO_NODE_DATA_DONE,
- DLM_RECO_NODE_DATA_FINALIZE_SENT,
+ DLM_RECO_NODE_DATA_REQUESTING = 1,
+ DLM_RECO_NODE_DATA_REQUESTED = 2,
+ DLM_RECO_NODE_DATA_RECEIVING = 3,
+ DLM_RECO_NODE_DATA_DONE = 4,
+ DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
};
enum {
DLM_MASTER_RESP_NO = 0,
- DLM_MASTER_RESP_YES,
- DLM_MASTER_RESP_MAYBE,
- DLM_MASTER_RESP_ERROR
+ DLM_MASTER_RESP_YES = 1,
+ DLM_MASTER_RESP_MAYBE = 2,
+ DLM_MASTER_RESP_ERROR = 3,
};
@@ -417,6 +500,9 @@ struct dlm_master_request
u8 name[O2NM_MAX_NAME_LEN];
};
+#define DLM_ASSERT_RESPONSE_REASSERT 0x00000001
+#define DLM_ASSERT_RESPONSE_MASTERY_REF 0x00000002
+
#define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001
#define DLM_ASSERT_MASTER_REQUERY 0x00000002
#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004
@@ -430,6 +516,8 @@ struct dlm_assert_master
u8 name[O2NM_MAX_NAME_LEN];
};
+#define DLM_MIGRATE_RESPONSE_MASTERY_REF 0x00000001
+
struct dlm_migrate_request
{
u8 master;
@@ -573,10 +661,26 @@ struct dlm_proxy_ast
#define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN)
#define DLM_MOD_KEY (0x666c6172)
-enum dlm_query_join_response {
+enum dlm_query_join_response_code {
JOIN_DISALLOW = 0,
- JOIN_OK,
- JOIN_OK_NO_MAP,
+ JOIN_OK = 1,
+ JOIN_OK_NO_MAP = 2,
+ JOIN_PROTOCOL_MISMATCH = 3,
+};
+
+struct dlm_query_join_packet {
+ u8 code; /* Response code. dlm_minor and fs_minor
+ are only valid if this is JOIN_OK */
+ u8 dlm_minor; /* The minor version of the protocol the
+ dlm is speaking. */
+ u8 fs_minor; /* The minor version of the protocol the
+ filesystem is speaking. */
+ u8 reserved;
+};
+
+union dlm_query_join_response {
+ __be32 intval;
+ struct dlm_query_join_packet packet;
};
struct dlm_lock_request
@@ -609,12 +713,18 @@ struct dlm_begin_reco
};
+#define BITS_PER_BYTE 8
+#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE)
+
struct dlm_query_join_request
{
u8 node_idx;
u8 pad1[2];
u8 name_len;
+ struct dlm_protocol_version dlm_proto;
+ struct dlm_protocol_version fs_proto;
u8 domain[O2NM_MAX_NAME_LEN];
+ u8 node_map[BITS_TO_BYTES(O2NM_MAX_NODES)];
};
struct dlm_assert_joined
@@ -633,6 +743,31 @@ struct dlm_cancel_join
u8 domain[O2NM_MAX_NAME_LEN];
};
+struct dlm_query_region {
+ u8 qr_node;
+ u8 qr_numregions;
+ u8 qr_namelen;
+ u8 pad1;
+ u8 qr_domain[O2NM_MAX_NAME_LEN];
+ u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
+};
+
+struct dlm_node_info {
+ u8 ni_nodenum;
+ u8 pad1;
+ __be16 ni_ipv4_port;
+ __be32 ni_ipv4_address;
+};
+
+struct dlm_query_nodeinfo {
+ u8 qn_nodenum;
+ u8 qn_numnodes;
+ u8 qn_namelen;
+ u8 pad1;
+ u8 qn_domain[O2NM_MAX_NAME_LEN];
+ struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
+};
+
struct dlm_exit_domain
{
u8 node_idx;
@@ -648,6 +783,16 @@ struct dlm_finalize_reco
__be32 pad2;
};
+struct dlm_deref_lockres
+{
+ u32 pad1;
+ u16 pad2;
+ u8 node_idx;
+ u8 namelen;
+
+ u8 name[O2NM_MAX_NAME_LEN];
+};
+
static inline enum dlm_status
__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
{
@@ -688,16 +833,20 @@ void dlm_lock_put(struct dlm_lock *lock);
void dlm_lock_attach_lockres(struct dlm_lock *lock,
struct dlm_lock_resource *res);
-int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
void dlm_revert_pending_convert(struct dlm_lock_resource *res,
struct dlm_lock *lock);
void dlm_revert_pending_lock(struct dlm_lock_resource *res,
struct dlm_lock *lock);
-int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
struct dlm_lock *lock);
void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
@@ -710,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
void dlm_put(struct dlm_ctxt *dlm);
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -721,8 +870,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
-void dlm_purge_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *lockres);
static inline void dlm_lockres_get(struct dlm_lock_resource *res)
{
/* This is called on every lookup, so it might be worth
@@ -730,9 +877,12 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res)
kref_get(&res->refs);
}
void dlm_lockres_put(struct dlm_lock_resource *res);
-void __dlm_unhash_lockres(struct dlm_lock_resource *res);
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res);
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
+ const char *name,
+ unsigned int len,
+ unsigned int hash);
struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
const char *name,
unsigned int len,
@@ -742,9 +892,7 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
unsigned int len);
int dlm_is_host_down(int errno);
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- u8 owner);
+
struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
const char *lockid,
int namelen,
@@ -753,8 +901,23 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
const char *name,
unsigned int namelen);
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit);
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit);
+
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
+
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
+
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_do_local_ast(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock);
@@ -801,10 +964,7 @@ int dlm_heartbeat_init(struct dlm_ctxt *dlm);
void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
-int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
-int dlm_migrate_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- u8 target);
+int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
int dlm_finish_migration(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
u8 old_master);
@@ -812,15 +972,27 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res);
-int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data);
-int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
+int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
u8 nodenum, u8 *real_master);
@@ -852,14 +1024,27 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
DLM_LOCK_RES_MIGRATING));
}
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+
+/* create/destroy slab caches */
+int dlm_init_master_caches(void);
+void dlm_destroy_master_caches(void);
+
+int dlm_init_lock_cache(void);
+void dlm_destroy_lock_cache(void);
int dlm_init_mle_cache(void);
void dlm_destroy_mle_cache(void);
+
void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
+int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
void dlm_clean_master_list(struct dlm_ctxt *dlm,
u8 dead_node);
+void dlm_force_free_mles(struct dlm_ctxt *dlm);
int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
-
+int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
int __dlm_lockres_unused(struct dlm_lock_resource *res);
static inline const char * dlm_lock_mode_name(int mode)
@@ -897,11 +1082,9 @@ static inline int dlm_lock_compatible(int existing, int request)
static inline int dlm_lock_on_list(struct list_head *head,
struct dlm_lock *lock)
{
- struct list_head *iter;
struct dlm_lock *tmplock;
- list_for_each(iter, head) {
- tmplock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(tmplock, head, list) {
if (tmplock == lock)
return 1;
}
@@ -945,6 +1128,23 @@ static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
return bit;
}
+static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res,
+ u8 owner)
+{
+ assert_spin_locked(&res->spinlock);
+
+ res->owner = owner;
+}
+static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res,
+ u8 owner)
+{
+ assert_spin_locked(&res->spinlock);
+
+ if (owner != res->owner)
+ dlm_set_lockres_owner(dlm, res, owner);
+}
#endif /* DLMCOMMON_H */
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index c764dc8e40a..e36d63ff178 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,9 +28,7 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -125,13 +123,12 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
int *kick_thread)
{
enum dlm_status status = DLM_NORMAL;
- struct list_head *iter;
struct dlm_lock *tmplock=NULL;
assert_spin_locked(&res->spinlock);
- mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n",
- lock->ml.type, lock->ml.convert_type, type);
+ mlog(0, "type=%d, convert_type=%d, new convert_type=%d\n",
+ lock->ml.type, lock->ml.convert_type, type);
spin_lock(&lock->spinlock);
@@ -187,16 +184,14 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
/* upconvert from here on */
status = DLM_NORMAL;
- list_for_each(iter, &res->granted) {
- tmplock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(tmplock, &res->granted, list) {
if (tmplock == lock)
continue;
if (!dlm_lock_compatible(tmplock->ml.type, type))
goto switch_queues;
}
- list_for_each(iter, &res->converting) {
- tmplock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(tmplock, &res->converting, list) {
if (!dlm_lock_compatible(tmplock->ml.type, type))
goto switch_queues;
/* existing conversion requests take precedence */
@@ -286,8 +281,8 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
__dlm_print_one_lock_resource(res);
mlog(ML_ERROR, "converting a remote lock that is already "
"converting! (cookie=%u:%llu, conv=%d)\n",
- dlm_get_lock_cookie_node(lock->ml.cookie),
- dlm_get_lock_cookie_seq(lock->ml.cookie),
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
lock->ml.convert_type);
status = DLM_DENIED;
goto bail;
@@ -355,7 +350,7 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
struct kvec vec[2];
size_t veclen = 1;
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+ mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
memset(&convert, 0, sizeof(struct dlm_convert_lock));
convert.node_idx = dlm->node_num;
@@ -392,12 +387,14 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
dlm_error(ret);
} else {
- mlog_errno(tmpret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
+ res->owner);
if (dlm_is_host_down(tmpret)) {
/* instead of logging the same network error over
* and over, sleep here and wait for the heartbeat
* to notice the node is dead. times out after 5s. */
- dlm_wait_for_node_death(dlm, res->owner,
+ dlm_wait_for_node_death(dlm, res->owner,
DLM_NODE_DEATH_WAIT_MAX);
ret = DLM_RECOVERING;
mlog(0, "node %u died so returning DLM_RECOVERING "
@@ -418,17 +415,18 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
* returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS,
* status from __dlmconvert_master
*/
-int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
- struct list_head *iter;
struct dlm_lock *lock = NULL;
+ struct dlm_lock *tmp_lock;
struct dlm_lockstatus *lksb;
enum dlm_status status = DLM_NORMAL;
u32 flags;
- int call_ast = 0, kick_thread = 0, ast_reserved = 0;
+ int call_ast = 0, kick_thread = 0, ast_reserved = 0, wake = 0;
if (!dlm_grab(dlm)) {
dlm_error(DLM_REJECTED);
@@ -470,34 +468,22 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
dlm_error(status);
goto leave;
}
- list_for_each(iter, &res->granted) {
- lock = list_entry(iter, struct dlm_lock, list);
- if (lock->ml.cookie == cnv->cookie &&
- lock->ml.node == cnv->node_idx) {
+ list_for_each_entry(tmp_lock, &res->granted, list) {
+ if (tmp_lock->ml.cookie == cnv->cookie &&
+ tmp_lock->ml.node == cnv->node_idx) {
+ lock = tmp_lock;
dlm_lock_get(lock);
break;
}
- lock = NULL;
- }
- if (!lock) {
- __dlm_print_one_lock_resource(res);
- list_for_each(iter, &res->granted) {
- lock = list_entry(iter, struct dlm_lock, list);
- if (lock->ml.node == cnv->node_idx) {
- mlog(ML_ERROR, "There is something here "
- "for node %u, lock->ml.cookie=%llu, "
- "cnv->cookie=%llu\n", cnv->node_idx,
- (unsigned long long)lock->ml.cookie,
- (unsigned long long)cnv->cookie);
- break;
- }
- }
- lock = NULL;
}
spin_unlock(&res->spinlock);
if (!lock) {
status = DLM_IVLOCKID;
- dlm_error(status);
+ mlog(ML_ERROR, "did not find lock to convert on grant queue! "
+ "cookie=%u:%llu\n",
+ dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie)));
+ dlm_print_one_lock_resource(res);
goto leave;
}
@@ -524,8 +510,11 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
cnv->requested_type,
&call_ast, &kick_thread);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+ wake = 1;
}
spin_unlock(&res->spinlock);
+ if (wake)
+ wake_up(&res->wq);
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
@@ -534,12 +523,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
}
leave:
- if (!lock)
- mlog(ML_ERROR, "did not find lock to convert on grant queue! "
- "cookie=%u:%llu\n",
- dlm_get_lock_cookie_node(cnv->cookie),
- dlm_get_lock_cookie_seq(cnv->cookie));
- else
+ if (lock)
dlm_lock_put(lock);
/* either queue the ast or release it, if reserved */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 3f6c8d88f7a..18f13c2e4a1 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -5,7 +5,7 @@
*
* debug functionality for the dlm
*
- * Copyright (C) 2004 Oracle. All rights reserved.
+ * Copyright (C) 2004, 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -27,9 +27,10 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/sysctl.h>
#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/export.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
@@ -37,111 +38,103 @@
#include "dlmapi.h"
#include "dlmcommon.h"
-
#include "dlmdomain.h"
+#include "dlmdebug.h"
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
+static int stringify_lockname(const char *lockname, int locklen, char *buf,
+ int len);
+
void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
{
- mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
- res->lockname.len, res->lockname.name,
- res->owner, res->state);
spin_lock(&res->spinlock);
__dlm_print_one_lock_resource(res);
spin_unlock(&res->spinlock);
}
-void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
+static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
{
- struct list_head *iter2;
- struct dlm_lock *lock;
-
+ int bit;
assert_spin_locked(&res->spinlock);
- mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
- res->lockname.len, res->lockname.name,
- res->owner, res->state);
- mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n",
- res->last_used, list_empty(&res->purge) ? "no" : "yes");
- mlog(ML_NOTICE, " granted queue: \n");
- list_for_each(iter2, &res->granted) {
- lock = list_entry(iter2, struct dlm_lock, list);
- spin_lock(&lock->spinlock);
- mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
- "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
- lock->ml.type, lock->ml.convert_type, lock->ml.node,
- dlm_get_lock_cookie_node(lock->ml.cookie),
- dlm_get_lock_cookie_seq(lock->ml.cookie),
- list_empty(&lock->ast_list) ? 'y' : 'n',
- lock->ast_pending ? 'y' : 'n',
- list_empty(&lock->bast_list) ? 'y' : 'n',
- lock->bast_pending ? 'y' : 'n');
- spin_unlock(&lock->spinlock);
- }
- mlog(ML_NOTICE, " converting queue: \n");
- list_for_each(iter2, &res->converting) {
- lock = list_entry(iter2, struct dlm_lock, list);
- spin_lock(&lock->spinlock);
- mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
- "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
- lock->ml.type, lock->ml.convert_type, lock->ml.node,
- dlm_get_lock_cookie_node(lock->ml.cookie),
- dlm_get_lock_cookie_seq(lock->ml.cookie),
- list_empty(&lock->ast_list) ? 'y' : 'n',
- lock->ast_pending ? 'y' : 'n',
- list_empty(&lock->bast_list) ? 'y' : 'n',
- lock->bast_pending ? 'y' : 'n');
- spin_unlock(&lock->spinlock);
- }
- mlog(ML_NOTICE, " blocked queue: \n");
- list_for_each(iter2, &res->blocked) {
- lock = list_entry(iter2, struct dlm_lock, list);
- spin_lock(&lock->spinlock);
- mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
- "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
- lock->ml.type, lock->ml.convert_type, lock->ml.node,
- dlm_get_lock_cookie_node(lock->ml.cookie),
- dlm_get_lock_cookie_seq(lock->ml.cookie),
- list_empty(&lock->ast_list) ? 'y' : 'n',
- lock->ast_pending ? 'y' : 'n',
- list_empty(&lock->bast_list) ? 'y' : 'n',
- lock->bast_pending ? 'y' : 'n');
- spin_unlock(&lock->spinlock);
+ printk(" refmap nodes: [ ");
+ bit = 0;
+ while (1) {
+ bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
+ if (bit >= O2NM_MAX_NODES)
+ break;
+ printk("%u ", bit);
+ bit++;
}
+ printk("], inflight=%u\n", res->inflight_locks);
}
-void dlm_print_one_lock(struct dlm_lock *lockid)
+static void __dlm_print_lock(struct dlm_lock *lock)
{
- dlm_print_one_lock_resource(lockid->lockres);
+ spin_lock(&lock->spinlock);
+
+ printk(" type=%d, conv=%d, node=%u, cookie=%u:%llu, "
+ "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
+ "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
+ lock->ml.type, lock->ml.convert_type, lock->ml.node,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ atomic_read(&lock->lock_refs.refcount),
+ (list_empty(&lock->ast_list) ? 'y' : 'n'),
+ (lock->ast_pending ? 'y' : 'n'),
+ (list_empty(&lock->bast_list) ? 'y' : 'n'),
+ (lock->bast_pending ? 'y' : 'n'),
+ (lock->convert_pending ? 'y' : 'n'),
+ (lock->lock_pending ? 'y' : 'n'),
+ (lock->cancel_pending ? 'y' : 'n'),
+ (lock->unlock_pending ? 'y' : 'n'));
+
+ spin_unlock(&lock->spinlock);
}
-EXPORT_SYMBOL_GPL(dlm_print_one_lock);
-#if 0
-void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
+void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
{
- struct dlm_lock_resource *res;
- struct hlist_node *iter;
- struct hlist_head *bucket;
- int i;
+ struct dlm_lock *lock;
+ char buf[DLM_LOCKID_NAME_MAX];
- mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
- dlm->name, dlm->node_num, dlm->key);
- if (!dlm || !dlm->name) {
- mlog(ML_ERROR, "dlm=%p\n", dlm);
- return;
- }
+ assert_spin_locked(&res->spinlock);
- spin_lock(&dlm->spinlock);
- for (i=0; i<DLM_HASH_BUCKETS; i++) {
- bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, iter, bucket, hash_node)
- dlm_print_one_lock_resource(res);
+ stringify_lockname(res->lockname.name, res->lockname.len,
+ buf, sizeof(buf));
+ printk("lockres: %s, owner=%u, state=%u\n",
+ buf, res->owner, res->state);
+ printk(" last used: %lu, refcnt: %u, on purge list: %s\n",
+ res->last_used, atomic_read(&res->refs.refcount),
+ list_empty(&res->purge) ? "no" : "yes");
+ printk(" on dirty list: %s, on reco list: %s, "
+ "migrating pending: %s\n",
+ list_empty(&res->dirty) ? "no" : "yes",
+ list_empty(&res->recovering) ? "no" : "yes",
+ res->migration_pending ? "yes" : "no");
+ printk(" inflight locks: %d, asts reserved: %d\n",
+ res->inflight_locks, atomic_read(&res->asts_reserved));
+ dlm_print_lockres_refmap(res);
+ printk(" granted queue:\n");
+ list_for_each_entry(lock, &res->granted, list) {
+ __dlm_print_lock(lock);
}
- spin_unlock(&dlm->spinlock);
+ printk(" converting queue:\n");
+ list_for_each_entry(lock, &res->converting, list) {
+ __dlm_print_lock(lock);
+ }
+ printk(" blocked queue:\n");
+ list_for_each_entry(lock, &res->blocked, list) {
+ __dlm_print_lock(lock);
+ }
+}
+
+void dlm_print_one_lock(struct dlm_lock *lockid)
+{
+ dlm_print_one_lock_resource(lockid->lockres);
}
-#endif /* 0 */
+EXPORT_SYMBOL_GPL(dlm_print_one_lock);
static const char *dlm_errnames[] = {
[DLM_NORMAL] = "DLM_NORMAL",
@@ -248,3 +241,771 @@ const char *dlm_errname(enum dlm_status err)
return dlm_errnames[err];
}
EXPORT_SYMBOL_GPL(dlm_errname);
+
+/* NOTE: This function converts a lockname into a string. It uses knowledge
+ * of the format of the lockname that should be outside the purview of the dlm.
+ * We are adding only to make dlm debugging slightly easier.
+ *
+ * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h.
+ */
+static int stringify_lockname(const char *lockname, int locklen, char *buf,
+ int len)
+{
+ int out = 0;
+ __be64 inode_blkno_be;
+
+#define OCFS2_DENTRY_LOCK_INO_START 18
+ if (*lockname == 'N') {
+ memcpy((__be64 *)&inode_blkno_be,
+ (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START],
+ sizeof(__be64));
+ out += snprintf(buf + out, len - out, "%.*s%08x",
+ OCFS2_DENTRY_LOCK_INO_START - 1, lockname,
+ (unsigned int)be64_to_cpu(inode_blkno_be));
+ } else
+ out += snprintf(buf + out, len - out, "%.*s",
+ locklen, lockname);
+ return out;
+}
+
+static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
+ char *buf, int len)
+{
+ int out = 0;
+ int i = -1;
+
+ while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes)
+ out += snprintf(buf + out, len - out, "%d ", i);
+
+ return out;
+}
+
+static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
+{
+ int out = 0;
+ char *mle_type;
+
+ if (mle->type == DLM_MLE_BLOCK)
+ mle_type = "BLK";
+ else if (mle->type == DLM_MLE_MASTER)
+ mle_type = "MAS";
+ else
+ mle_type = "MIG";
+
+ out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
+ out += snprintf(buf + out, len - out,
+ "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
+ mle_type, mle->master, mle->new_master,
+ !list_empty(&mle->hb_events),
+ !!mle->inuse,
+ atomic_read(&mle->mle_refs.refcount));
+
+ out += snprintf(buf + out, len - out, "Maybe=");
+ out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
+
+ out += snprintf(buf + out, len - out, "Vote=");
+ out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES,
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
+
+ out += snprintf(buf + out, len - out, "Response=");
+ out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES,
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
+
+ out += snprintf(buf + out, len - out, "Node=");
+ out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES,
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
+
+ out += snprintf(buf + out, len - out, "\n");
+
+ return out;
+}
+
+void dlm_print_one_mle(struct dlm_master_list_entry *mle)
+{
+ char *buf;
+
+ buf = (char *) get_zeroed_page(GFP_NOFS);
+ if (buf) {
+ dump_mle(mle, buf, PAGE_SIZE - 1);
+ free_page((unsigned long)buf);
+ }
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+static struct dentry *dlm_debugfs_root;
+
+#define DLM_DEBUGFS_DIR "o2dlm"
+#define DLM_DEBUGFS_DLM_STATE "dlm_state"
+#define DLM_DEBUGFS_LOCKING_STATE "locking_state"
+#define DLM_DEBUGFS_MLE_STATE "mle_state"
+#define DLM_DEBUGFS_PURGE_LIST "purge_list"
+
+/* begin - utils funcs */
+static void dlm_debug_free(struct kref *kref)
+{
+ struct dlm_debug_ctxt *dc;
+
+ dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt);
+
+ kfree(dc);
+}
+
+static void dlm_debug_put(struct dlm_debug_ctxt *dc)
+{
+ if (dc)
+ kref_put(&dc->debug_refcnt, dlm_debug_free);
+}
+
+static void dlm_debug_get(struct dlm_debug_ctxt *dc)
+{
+ kref_get(&dc->debug_refcnt);
+}
+
+static int debug_release(struct inode *inode, struct file *file)
+{
+ free_page((unsigned long)file->private_data);
+ return 0;
+}
+
+static ssize_t debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+ i_size_read(file->f_mapping->host));
+}
+/* end - util funcs */
+
+/* begin - purge list funcs */
+static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
+{
+ struct dlm_lock_resource *res;
+ int out = 0;
+ unsigned long total = 0;
+
+ out += snprintf(buf + out, len - out,
+ "Dumping Purgelist for Domain: %s\n", dlm->name);
+
+ spin_lock(&dlm->spinlock);
+ list_for_each_entry(res, &dlm->purge_list, purge) {
+ ++total;
+ if (len - out < 100)
+ continue;
+ spin_lock(&res->spinlock);
+ out += stringify_lockname(res->lockname.name,
+ res->lockname.len,
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\t%ld\n",
+ (jiffies - res->last_used)/HZ);
+ spin_unlock(&res->spinlock);
+ }
+ spin_unlock(&dlm->spinlock);
+
+ out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
+
+ return out;
+}
+
+static int debug_purgelist_open(struct inode *inode, struct file *file)
+{
+ struct dlm_ctxt *dlm = inode->i_private;
+ char *buf = NULL;
+
+ buf = (char *) get_zeroed_page(GFP_NOFS);
+ if (!buf)
+ goto bail;
+
+ i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
+
+ file->private_data = buf;
+
+ return 0;
+bail:
+ return -ENOMEM;
+}
+
+static const struct file_operations debug_purgelist_fops = {
+ .open = debug_purgelist_open,
+ .release = debug_release,
+ .read = debug_read,
+ .llseek = generic_file_llseek,
+};
+/* end - purge list funcs */
+
+/* begin - debug mle funcs */
+static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
+{
+ struct dlm_master_list_entry *mle;
+ struct hlist_head *bucket;
+ int i, out = 0;
+ unsigned long total = 0, longest = 0, bucket_count = 0;
+
+ out += snprintf(buf + out, len - out,
+ "Dumping MLEs for Domain: %s\n", dlm->name);
+
+ spin_lock(&dlm->master_lock);
+ for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+ bucket = dlm_master_hash(dlm, i);
+ hlist_for_each_entry(mle, bucket, master_hash_node) {
+ ++total;
+ ++bucket_count;
+ if (len - out < 200)
+ continue;
+ out += dump_mle(mle, buf + out, len - out);
+ }
+ longest = max(longest, bucket_count);
+ bucket_count = 0;
+ }
+ spin_unlock(&dlm->master_lock);
+
+ out += snprintf(buf + out, len - out,
+ "Total: %ld, Longest: %ld\n", total, longest);
+ return out;
+}
+
+static int debug_mle_open(struct inode *inode, struct file *file)
+{
+ struct dlm_ctxt *dlm = inode->i_private;
+ char *buf = NULL;
+
+ buf = (char *) get_zeroed_page(GFP_NOFS);
+ if (!buf)
+ goto bail;
+
+ i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
+
+ file->private_data = buf;
+
+ return 0;
+bail:
+ return -ENOMEM;
+}
+
+static const struct file_operations debug_mle_fops = {
+ .open = debug_mle_open,
+ .release = debug_release,
+ .read = debug_read,
+ .llseek = generic_file_llseek,
+};
+
+/* end - debug mle funcs */
+
+/* begin - debug lockres funcs */
+static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
+{
+ int out;
+
+#define DEBUG_LOCK_VERSION 1
+ spin_lock(&lock->spinlock);
+ out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
+ "%d,%d,%d,%d\n",
+ DEBUG_LOCK_VERSION,
+ list_type, lock->ml.type, lock->ml.convert_type,
+ lock->ml.node,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ !list_empty(&lock->ast_list),
+ !list_empty(&lock->bast_list),
+ lock->ast_pending, lock->bast_pending,
+ lock->convert_pending, lock->lock_pending,
+ lock->cancel_pending, lock->unlock_pending,
+ atomic_read(&lock->lock_refs.refcount));
+ spin_unlock(&lock->spinlock);
+
+ return out;
+}
+
+static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
+{
+ struct dlm_lock *lock;
+ int i;
+ int out = 0;
+
+ out += snprintf(buf + out, len - out, "NAME:");
+ out += stringify_lockname(res->lockname.name, res->lockname.len,
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
+
+#define DEBUG_LRES_VERSION 1
+ out += snprintf(buf + out, len - out,
+ "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n",
+ DEBUG_LRES_VERSION,
+ res->owner, res->state, res->last_used,
+ !list_empty(&res->purge),
+ !list_empty(&res->dirty),
+ !list_empty(&res->recovering),
+ res->inflight_locks, res->migration_pending,
+ atomic_read(&res->asts_reserved),
+ atomic_read(&res->refs.refcount));
+
+ /* refmap */
+ out += snprintf(buf + out, len - out, "RMAP:");
+ out += stringify_nodemap(res->refmap, O2NM_MAX_NODES,
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
+
+ /* lvb */
+ out += snprintf(buf + out, len - out, "LVBX:");
+ for (i = 0; i < DLM_LVB_LEN; i++)
+ out += snprintf(buf + out, len - out,
+ "%02x", (unsigned char)res->lvb[i]);
+ out += snprintf(buf + out, len - out, "\n");
+
+ /* granted */
+ list_for_each_entry(lock, &res->granted, list)
+ out += dump_lock(lock, 0, buf + out, len - out);
+
+ /* converting */
+ list_for_each_entry(lock, &res->converting, list)
+ out += dump_lock(lock, 1, buf + out, len - out);
+
+ /* blocked */
+ list_for_each_entry(lock, &res->blocked, list)
+ out += dump_lock(lock, 2, buf + out, len - out);
+
+ out += snprintf(buf + out, len - out, "\n");
+
+ return out;
+}
+
+static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct debug_lockres *dl = m->private;
+ struct dlm_ctxt *dlm = dl->dl_ctxt;
+ struct dlm_lock_resource *oldres = dl->dl_res;
+ struct dlm_lock_resource *res = NULL;
+ struct list_head *track_list;
+
+ spin_lock(&dlm->track_lock);
+ if (oldres)
+ track_list = &oldres->tracking;
+ else {
+ track_list = &dlm->tracking_list;
+ if (list_empty(track_list)) {
+ dl = NULL;
+ spin_unlock(&dlm->track_lock);
+ goto bail;
+ }
+ }
+
+ list_for_each_entry(res, track_list, tracking) {
+ if (&res->tracking == &dlm->tracking_list)
+ res = NULL;
+ else
+ dlm_lockres_get(res);
+ break;
+ }
+ spin_unlock(&dlm->track_lock);
+
+ if (oldres)
+ dlm_lockres_put(oldres);
+
+ dl->dl_res = res;
+
+ if (res) {
+ spin_lock(&res->spinlock);
+ dump_lockres(res, dl->dl_buf, dl->dl_len - 1);
+ spin_unlock(&res->spinlock);
+ } else
+ dl = NULL;
+
+bail:
+ /* passed to seq_show */
+ return dl;
+}
+
+static void lockres_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return NULL;
+}
+
+static int lockres_seq_show(struct seq_file *s, void *v)
+{
+ struct debug_lockres *dl = (struct debug_lockres *)v;
+
+ seq_printf(s, "%s", dl->dl_buf);
+
+ return 0;
+}
+
+static const struct seq_operations debug_lockres_ops = {
+ .start = lockres_seq_start,
+ .stop = lockres_seq_stop,
+ .next = lockres_seq_next,
+ .show = lockres_seq_show,
+};
+
+static int debug_lockres_open(struct inode *inode, struct file *file)
+{
+ struct dlm_ctxt *dlm = inode->i_private;
+ int ret = -ENOMEM;
+ struct seq_file *seq;
+ struct debug_lockres *dl = NULL;
+
+ dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL);
+ if (!dl) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ dl->dl_len = PAGE_SIZE;
+ dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL);
+ if (!dl->dl_buf) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ ret = seq_open(file, &debug_lockres_ops);
+ if (ret) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ seq = file->private_data;
+ seq->private = dl;
+
+ dlm_grab(dlm);
+ dl->dl_ctxt = dlm;
+
+ return 0;
+bail:
+ if (dl)
+ kfree(dl->dl_buf);
+ kfree(dl);
+ return ret;
+}
+
+static int debug_lockres_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct debug_lockres *dl = (struct debug_lockres *)seq->private;
+
+ if (dl->dl_res)
+ dlm_lockres_put(dl->dl_res);
+ dlm_put(dl->dl_ctxt);
+ kfree(dl->dl_buf);
+ return seq_release_private(inode, file);
+}
+
+static const struct file_operations debug_lockres_fops = {
+ .open = debug_lockres_open,
+ .release = debug_lockres_release,
+ .read = seq_read,
+ .llseek = seq_lseek,
+};
+/* end - debug lockres funcs */
+
+/* begin - debug state funcs */
+static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
+{
+ int out = 0;
+ struct dlm_reco_node_data *node;
+ char *state;
+ int cur_mles = 0, tot_mles = 0;
+ int i;
+
+ spin_lock(&dlm->spinlock);
+
+ switch (dlm->dlm_state) {
+ case DLM_CTXT_NEW:
+ state = "NEW"; break;
+ case DLM_CTXT_JOINED:
+ state = "JOINED"; break;
+ case DLM_CTXT_IN_SHUTDOWN:
+ state = "SHUTDOWN"; break;
+ case DLM_CTXT_LEAVING:
+ state = "LEAVING"; break;
+ default:
+ state = "UNKNOWN"; break;
+ }
+
+ /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
+ out += snprintf(buf + out, len - out,
+ "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
+ dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor);
+
+ /* Thread Pid: xxx Node: xxx State: xxxxx */
+ out += snprintf(buf + out, len - out,
+ "Thread Pid: %d Node: %d State: %s\n",
+ task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
+
+ /* Number of Joins: xxx Joining Node: xxx */
+ out += snprintf(buf + out, len - out,
+ "Number of Joins: %d Joining Node: %d\n",
+ dlm->num_joins, dlm->joining_node);
+
+ /* Domain Map: xx xx xx */
+ out += snprintf(buf + out, len - out, "Domain Map: ");
+ out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
+
+ /* Exit Domain Map: xx xx xx */
+ out += snprintf(buf + out, len - out, "Exit Domain Map: ");
+ out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
+
+ /* Live Map: xx xx xx */
+ out += snprintf(buf + out, len - out, "Live Map: ");
+ out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
+
+ /* Lock Resources: xxx (xxx) */
+ out += snprintf(buf + out, len - out,
+ "Lock Resources: %d (%d)\n",
+ atomic_read(&dlm->res_cur_count),
+ atomic_read(&dlm->res_tot_count));
+
+ for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+ tot_mles += atomic_read(&dlm->mle_tot_count[i]);
+
+ for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+ cur_mles += atomic_read(&dlm->mle_cur_count[i]);
+
+ /* MLEs: xxx (xxx) */
+ out += snprintf(buf + out, len - out,
+ "MLEs: %d (%d)\n", cur_mles, tot_mles);
+
+ /* Blocking: xxx (xxx) */
+ out += snprintf(buf + out, len - out,
+ " Blocking: %d (%d)\n",
+ atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
+ atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
+
+ /* Mastery: xxx (xxx) */
+ out += snprintf(buf + out, len - out,
+ " Mastery: %d (%d)\n",
+ atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
+ atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
+
+ /* Migration: xxx (xxx) */
+ out += snprintf(buf + out, len - out,
+ " Migration: %d (%d)\n",
+ atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
+ atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
+
+ /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
+ out += snprintf(buf + out, len - out,
+ "Lists: Dirty=%s Purge=%s PendingASTs=%s "
+ "PendingBASTs=%s\n",
+ (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
+ (list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
+ (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
+ (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
+
+ /* Purge Count: xxx Refs: xxx */
+ out += snprintf(buf + out, len - out,
+ "Purge Count: %d Refs: %d\n", dlm->purge_count,
+ atomic_read(&dlm->dlm_refs.refcount));
+
+ /* Dead Node: xxx */
+ out += snprintf(buf + out, len - out,
+ "Dead Node: %d\n", dlm->reco.dead_node);
+
+ /* What about DLM_RECO_STATE_FINALIZE? */
+ if (dlm->reco.state == DLM_RECO_STATE_ACTIVE)
+ state = "ACTIVE";
+ else
+ state = "INACTIVE";
+
+ /* Recovery Pid: xxxx Master: xxx State: xxxx */
+ out += snprintf(buf + out, len - out,
+ "Recovery Pid: %d Master: %d State: %s\n",
+ task_pid_nr(dlm->dlm_reco_thread_task),
+ dlm->reco.new_master, state);
+
+ /* Recovery Map: xx xx */
+ out += snprintf(buf + out, len - out, "Recovery Map: ");
+ out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
+
+ /* Recovery Node State: */
+ out += snprintf(buf + out, len - out, "Recovery Node State:\n");
+ list_for_each_entry(node, &dlm->reco.node_data, list) {
+ switch (node->state) {
+ case DLM_RECO_NODE_DATA_INIT:
+ state = "INIT";
+ break;
+ case DLM_RECO_NODE_DATA_REQUESTING:
+ state = "REQUESTING";
+ break;
+ case DLM_RECO_NODE_DATA_DEAD:
+ state = "DEAD";
+ break;
+ case DLM_RECO_NODE_DATA_RECEIVING:
+ state = "RECEIVING";
+ break;
+ case DLM_RECO_NODE_DATA_REQUESTED:
+ state = "REQUESTED";
+ break;
+ case DLM_RECO_NODE_DATA_DONE:
+ state = "DONE";
+ break;
+ case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+ state = "FINALIZE-SENT";
+ break;
+ default:
+ state = "BAD";
+ break;
+ }
+ out += snprintf(buf + out, len - out, "\t%u - %s\n",
+ node->node_num, state);
+ }
+
+ spin_unlock(&dlm->spinlock);
+
+ return out;
+}
+
+static int debug_state_open(struct inode *inode, struct file *file)
+{
+ struct dlm_ctxt *dlm = inode->i_private;
+ char *buf = NULL;
+
+ buf = (char *) get_zeroed_page(GFP_NOFS);
+ if (!buf)
+ goto bail;
+
+ i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
+
+ file->private_data = buf;
+
+ return 0;
+bail:
+ return -ENOMEM;
+}
+
+static const struct file_operations debug_state_fops = {
+ .open = debug_state_open,
+ .release = debug_release,
+ .read = debug_read,
+ .llseek = generic_file_llseek,
+};
+/* end - debug state funcs */
+
+/* files in subroot */
+int dlm_debug_init(struct dlm_ctxt *dlm)
+{
+ struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
+
+ /* for dumping dlm_ctxt */
+ dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE,
+ S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot,
+ dlm, &debug_state_fops);
+ if (!dc->debug_state_dentry) {
+ mlog_errno(-ENOMEM);
+ goto bail;
+ }
+
+ /* for dumping lockres */
+ dc->debug_lockres_dentry =
+ debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
+ S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot,
+ dlm, &debug_lockres_fops);
+ if (!dc->debug_lockres_dentry) {
+ mlog_errno(-ENOMEM);
+ goto bail;
+ }
+
+ /* for dumping mles */
+ dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE,
+ S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot,
+ dlm, &debug_mle_fops);
+ if (!dc->debug_mle_dentry) {
+ mlog_errno(-ENOMEM);
+ goto bail;
+ }
+
+ /* for dumping lockres on the purge list */
+ dc->debug_purgelist_dentry =
+ debugfs_create_file(DLM_DEBUGFS_PURGE_LIST,
+ S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot,
+ dlm, &debug_purgelist_fops);
+ if (!dc->debug_purgelist_dentry) {
+ mlog_errno(-ENOMEM);
+ goto bail;
+ }
+
+ dlm_debug_get(dc);
+ return 0;
+
+bail:
+ dlm_debug_shutdown(dlm);
+ return -ENOMEM;
+}
+
+void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+{
+ struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
+
+ if (dc) {
+ debugfs_remove(dc->debug_purgelist_dentry);
+ debugfs_remove(dc->debug_mle_dentry);
+ debugfs_remove(dc->debug_lockres_dentry);
+ debugfs_remove(dc->debug_state_dentry);
+ dlm_debug_put(dc);
+ }
+}
+
+/* subroot - domain dir */
+int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+ dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
+ dlm_debugfs_root);
+ if (!dlm->dlm_debugfs_subroot) {
+ mlog_errno(-ENOMEM);
+ goto bail;
+ }
+
+ dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt),
+ GFP_KERNEL);
+ if (!dlm->dlm_debug_ctxt) {
+ mlog_errno(-ENOMEM);
+ goto bail;
+ }
+ kref_init(&dlm->dlm_debug_ctxt->debug_refcnt);
+
+ return 0;
+bail:
+ dlm_destroy_debugfs_subroot(dlm);
+ return -ENOMEM;
+}
+
+void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+ debugfs_remove(dlm->dlm_debugfs_subroot);
+}
+
+/* debugfs root */
+int dlm_create_debugfs_root(void)
+{
+ dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL);
+ if (!dlm_debugfs_root) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void dlm_destroy_debugfs_root(void)
+{
+ debugfs_remove(dlm_debugfs_root);
+}
+#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
new file mode 100644
index 00000000000..1f27c4812d1
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -0,0 +1,81 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdebug.h
+ *
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+#ifndef DLMDEBUG_H
+#define DLMDEBUG_H
+
+void dlm_print_one_mle(struct dlm_master_list_entry *mle);
+
+#ifdef CONFIG_DEBUG_FS
+
+struct dlm_debug_ctxt {
+ struct kref debug_refcnt;
+ struct dentry *debug_state_dentry;
+ struct dentry *debug_lockres_dentry;
+ struct dentry *debug_mle_dentry;
+ struct dentry *debug_purgelist_dentry;
+};
+
+struct debug_lockres {
+ int dl_len;
+ char *dl_buf;
+ struct dlm_ctxt *dl_ctxt;
+ struct dlm_lock_resource *dl_res;
+};
+
+int dlm_debug_init(struct dlm_ctxt *dlm);
+void dlm_debug_shutdown(struct dlm_ctxt *dlm);
+
+int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
+void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
+
+int dlm_create_debugfs_root(void);
+void dlm_destroy_debugfs_root(void);
+
+#else
+
+static inline int dlm_debug_init(struct dlm_ctxt *dlm)
+{
+ return 0;
+}
+static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+{
+}
+static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+ return 0;
+}
+static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+}
+static inline int dlm_create_debugfs_root(void)
+{
+ return 0;
+}
+static inline void dlm_destroy_debugfs_root(void)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
+#endif /* DLMDEBUG_H */
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8d1065f8b3b..39efc5057a3 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -28,11 +28,11 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/err.h>
+#include <linux/debugfs.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
@@ -40,14 +40,42 @@
#include "dlmapi.h"
#include "dlmcommon.h"
-
#include "dlmdomain.h"
-
-#include "dlmver.h"
+#include "dlmdebug.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
#include "cluster/masklog.h"
+/*
+ * ocfs2 node maps are array of long int, which limits to send them freely
+ * across the wire due to endianness issues. To workaround this, we convert
+ * long ints to byte arrays. Following 3 routines are helper functions to
+ * set/test/copy bits within those array of bytes
+ */
+static inline void byte_set_bit(u8 nr, u8 map[])
+{
+ map[nr >> 3] |= (1UL << (nr & 7));
+}
+
+static inline int byte_test_bit(u8 nr, u8 map[])
+{
+ return ((1UL << (nr & 7)) & (map[nr >> 3])) != 0;
+}
+
+static inline void byte_copymap(u8 dmap[], unsigned long smap[],
+ unsigned int sz)
+{
+ unsigned int nn;
+
+ if (!sz)
+ return;
+
+ memset(dmap, 0, ((sz + 7) >> 3));
+ for (nn = 0 ; nn < sz; nn++)
+ if (test_bit(nn, smap))
+ byte_set_bit(nn, dmap);
+}
+
static void dlm_free_pagevec(void **vec, int pages)
{
while (pages--)
@@ -68,7 +96,8 @@ static void **dlm_alloc_pagevec(int pages)
goto out_free;
mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n",
- pages, DLM_HASH_PAGES, (unsigned long)DLM_BUCKETS_PER_PAGE);
+ pages, (unsigned long)DLM_HASH_PAGES,
+ (unsigned long)DLM_BUCKETS_PER_PAGE);
return vec;
out_free:
dlm_free_pagevec(vec, i);
@@ -92,23 +121,52 @@ DEFINE_SPINLOCK(dlm_domain_lock);
LIST_HEAD(dlm_domains);
static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
+/*
+ * The supported protocol version for DLM communication. Running domains
+ * will have a negotiated version with the same major number and a minor
+ * number equal or smaller. The dlm_ctxt->dlm_locking_proto field should
+ * be used to determine what a running domain is actually using.
+ *
+ * New in version 1.1:
+ * - Message DLM_QUERY_REGION added to support global heartbeat
+ * - Message DLM_QUERY_NODEINFO added to allow online node removes
+ * New in version 1.2:
+ * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
+ */
+static const struct dlm_protocol_version dlm_protocol = {
+ .pv_major = 1,
+ .pv_minor = 2,
+};
+
#define DLM_DOMAIN_BACKOFF_MS 200
-static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data);
-static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data);
-static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data);
-static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data);
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+ void *data, void **ret_data);
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
+static int dlm_protocol_compare(struct dlm_protocol_version *existing,
+ struct dlm_protocol_version *request);
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
-void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- hlist_del_init(&lockres->hash_node);
- dlm_lockres_put(lockres);
+ if (hlist_unhashed(&res->hash_node))
+ return;
+
+ mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
+ hlist_del_init(&res->hash_node);
+ dlm_lockres_put(res);
}
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res)
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
struct hlist_head *bucket;
struct qstr *q;
@@ -122,25 +180,26 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
dlm_lockres_get(res);
hlist_add_head(&res->hash_node, bucket);
+
+ mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
}
-struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
- const char *name,
- unsigned int len,
- unsigned int hash)
+struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
+ const char *name,
+ unsigned int len,
+ unsigned int hash)
{
struct hlist_head *bucket;
- struct hlist_node *list;
+ struct dlm_lock_resource *res;
- mlog_entry("%.*s\n", len, name);
+ mlog(0, "%.*s\n", len, name);
assert_spin_locked(&dlm->spinlock);
bucket = dlm_lockres_hash(dlm, hash);
- hlist_for_each(list, bucket) {
- struct dlm_lock_resource *res = hlist_entry(list,
- struct dlm_lock_resource, hash_node);
+ hlist_for_each_entry(res, bucket, hash_node) {
if (res->lockname.name[0] != name[0])
continue;
if (unlikely(res->lockname.len != len))
@@ -153,6 +212,37 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
return NULL;
}
+/* intended to be called by functions which do not care about lock
+ * resources which are being purged (most net _handler functions).
+ * this will return NULL for any lock resource which is found but
+ * currently in the process of dropping its mastery reference.
+ * use __dlm_lookup_lockres_full when you need the lock resource
+ * regardless (e.g. dlm_get_lock_resource) */
+struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
+ const char *name,
+ unsigned int len,
+ unsigned int hash)
+{
+ struct dlm_lock_resource *res = NULL;
+
+ mlog(0, "%.*s\n", len, name);
+
+ assert_spin_locked(&dlm->spinlock);
+
+ res = __dlm_lookup_lockres_full(dlm, name, len, hash);
+ if (res) {
+ spin_lock(&res->spinlock);
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ spin_unlock(&res->spinlock);
+ dlm_lockres_put(res);
+ return NULL;
+ }
+ spin_unlock(&res->spinlock);
+ }
+
+ return res;
+}
+
struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
const char *name,
unsigned int len)
@@ -168,22 +258,19 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
{
- struct dlm_ctxt *tmp = NULL;
- struct list_head *iter;
+ struct dlm_ctxt *tmp;
assert_spin_locked(&dlm_domain_lock);
/* tmp->name here is always NULL terminated,
* but domain may not be! */
- list_for_each(iter, &dlm_domains) {
- tmp = list_entry (iter, struct dlm_ctxt, list);
+ list_for_each_entry(tmp, &dlm_domains, list) {
if (strlen(tmp->name) == len &&
memcmp(tmp->name, domain, len)==0)
- break;
- tmp = NULL;
+ return tmp;
}
- return tmp;
+ return NULL;
}
/* For null terminated domain strings ONLY */
@@ -217,12 +304,15 @@ static int dlm_wait_on_domain_helper(const char *domain)
static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
{
+ dlm_destroy_debugfs_subroot(dlm);
+
if (dlm->lockres_hash)
dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
- if (dlm->name)
- kfree(dlm->name);
+ if (dlm->master_hash)
+ dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
+ kfree(dlm->name);
kfree(dlm);
}
@@ -269,25 +359,22 @@ static void __dlm_get(struct dlm_ctxt *dlm)
* you shouldn't trust your pointer. */
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
{
- struct list_head *iter;
- struct dlm_ctxt *target = NULL;
+ struct dlm_ctxt *target;
+ struct dlm_ctxt *ret = NULL;
spin_lock(&dlm_domain_lock);
- list_for_each(iter, &dlm_domains) {
- target = list_entry (iter, struct dlm_ctxt, list);
-
+ list_for_each_entry(target, &dlm_domains, list) {
if (target == dlm) {
__dlm_get(target);
+ ret = target;
break;
}
-
- target = NULL;
}
spin_unlock(&dlm_domain_lock);
- return target;
+ return ret;
}
int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
@@ -314,6 +401,7 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
{
dlm_unregister_domain_handlers(dlm);
+ dlm_debug_shutdown(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
dlm_destroy_dlm_worker(dlm);
@@ -329,43 +417,61 @@ static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
wake_up(&dlm_domain_events);
}
-static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
+static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
{
- int i;
+ int i, num, n, ret = 0;
struct dlm_lock_resource *res;
+ struct hlist_node *iter;
+ struct hlist_head *bucket;
+ int dropped;
mlog(0, "Migrating locks from domain %s\n", dlm->name);
-restart:
+
+ num = 0;
spin_lock(&dlm->spinlock);
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
- while (!hlist_empty(dlm_lockres_hash(dlm, i))) {
- res = hlist_entry(dlm_lockres_hash(dlm, i)->first,
- struct dlm_lock_resource, hash_node);
- /* need reference when manually grabbing lockres */
+redo_bucket:
+ n = 0;
+ bucket = dlm_lockres_hash(dlm, i);
+ iter = bucket->first;
+ while (iter) {
+ n++;
+ res = hlist_entry(iter, struct dlm_lock_resource,
+ hash_node);
dlm_lockres_get(res);
- /* this should unhash the lockres
- * and exit with dlm->spinlock */
- mlog(0, "purging res=%p\n", res);
- if (dlm_lockres_is_dirty(dlm, res)) {
- /* HACK! this should absolutely go.
- * need to figure out why some empty
- * lockreses are still marked dirty */
- mlog(ML_ERROR, "lockres %.*s dirty!\n",
- res->lockname.len, res->lockname.name);
-
- spin_unlock(&dlm->spinlock);
- dlm_kick_thread(dlm, res);
- wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
- dlm_lockres_put(res);
- goto restart;
- }
- dlm_purge_lockres(dlm, res);
+ /* migrate, if necessary. this will drop the dlm
+ * spinlock and retake it if it does migration. */
+ dropped = dlm_empty_lockres(dlm, res);
+
+ spin_lock(&res->spinlock);
+ if (dropped)
+ __dlm_lockres_calc_usage(dlm, res);
+ else
+ iter = res->hash_node.next;
+ spin_unlock(&res->spinlock);
+
dlm_lockres_put(res);
+
+ if (dropped) {
+ cond_resched_lock(&dlm->spinlock);
+ goto redo_bucket;
+ }
}
+ cond_resched_lock(&dlm->spinlock);
+ num += n;
}
spin_unlock(&dlm->spinlock);
-
+ wake_up(&dlm->dlm_thread_wq);
+
+ /* let the dlm thread take care of purging, keep scanning until
+ * nothing remains in the hash */
+ if (num) {
+ mlog(0, "%s: %d lock resources in hash last pass\n",
+ dlm->name, num);
+ ret = -EAGAIN;
+ }
mlog(0, "DONE Migrating locks from domain %s\n", dlm->name);
+ return ret;
}
static int dlm_no_joining_node(struct dlm_ctxt *dlm)
@@ -379,6 +485,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
return ret;
}
+static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
+ void *data, void **ret_data)
+{
+ struct dlm_ctxt *dlm = data;
+ unsigned int node;
+ struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
+
+ if (!dlm_grab(dlm))
+ return 0;
+
+ node = exit_msg->node_idx;
+ mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);
+
+ spin_lock(&dlm->spinlock);
+ set_bit(node, dlm->exit_domain_map);
+ spin_unlock(&dlm->spinlock);
+
+ dlm_put(dlm);
+
+ return 0;
+}
+
static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
{
/* Yikes, a double spinlock! I need domain_lock for the dlm
@@ -404,36 +532,37 @@ again:
static void __dlm_print_nodes(struct dlm_ctxt *dlm)
{
- int node = -1;
+ int node = -1, num = 0;
assert_spin_locked(&dlm->spinlock);
- printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name);
-
+ printk("( ");
while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
node + 1)) < O2NM_MAX_NODES) {
printk("%d ", node);
+ ++num;
}
- printk("\n");
+ printk(") %u nodes\n", num);
}
-static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
+static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_ctxt *dlm = data;
unsigned int node;
struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
- mlog_entry("%p %u %p", msg, len, data);
+ mlog(0, "%p %u %p", msg, len, data);
if (!dlm_grab(dlm))
return 0;
node = exit_msg->node_idx;
- printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name);
-
spin_lock(&dlm->spinlock);
clear_bit(node, dlm->domain_map);
+ clear_bit(node, dlm->exit_domain_map);
+ printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name);
__dlm_print_nodes(dlm);
/* notify anything attached to the heartbeat events */
@@ -446,27 +575,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
return 0;
}
-static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
+static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
unsigned int node)
{
int status;
struct dlm_exit_domain leave_msg;
- mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
- node, dlm->name, dlm->node_num);
+ mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
+ msg_type, node);
memset(&leave_msg, 0, sizeof(leave_msg));
leave_msg.node_idx = dlm->node_num;
- status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
- &leave_msg, sizeof(leave_msg), node,
- NULL);
-
- mlog(0, "status return %d from o2net_send_message\n", status);
+ status = o2net_send_message(msg_type, dlm->key, &leave_msg,
+ sizeof(leave_msg), node, NULL);
+ if (status < 0)
+ mlog(ML_ERROR, "Error %d sending domain exit message %u "
+ "to node %u on domain %s\n", status, msg_type, node,
+ dlm->name);
return status;
}
+static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
+{
+ int node = -1;
+
+ /* Support for begin exit domain was added in 1.2 */
+ if (dlm->dlm_locking_proto.pv_major == 1 &&
+ dlm->dlm_locking_proto.pv_minor < 2)
+ return;
+
+ /*
+ * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
+ * informational. Meaning if a node does not receive the message,
+ * so be it.
+ */
+ spin_lock(&dlm->spinlock);
+ while (1) {
+ node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1);
+ if (node >= O2NM_MAX_NODES)
+ break;
+ if (node == dlm->node_num)
+ continue;
+
+ spin_unlock(&dlm->spinlock);
+ dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
+ spin_lock(&dlm->spinlock);
+ }
+ spin_unlock(&dlm->spinlock);
+}
static void dlm_leave_domain(struct dlm_ctxt *dlm)
{
@@ -492,7 +650,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
clear_node = 1;
- status = dlm_send_one_domain_exit(dlm, node);
+ status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
+ node);
if (status < 0 &&
status != -ENOPROTOOPT &&
status != -ENOTCONN) {
@@ -546,6 +705,7 @@ int dlm_shutting_down(struct dlm_ctxt *dlm)
void dlm_unregister_domain(struct dlm_ctxt *dlm)
{
int leave = 0;
+ struct dlm_lock_resource *res;
spin_lock(&dlm_domain_lock);
BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
@@ -566,24 +726,108 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
if (leave) {
mlog(0, "shutting down domain %s\n", dlm->name);
+ dlm_begin_exit_domain(dlm);
/* We changed dlm state, notify the thread */
dlm_kick_thread(dlm, NULL);
- dlm_migrate_all_locks(dlm);
+ while (dlm_migrate_all_locks(dlm)) {
+ /* Give dlm_thread time to purge the lockres' */
+ msleep(500);
+ mlog(0, "%s: more migration to do\n", dlm->name);
+ }
+
+ /* This list should be empty. If not, print remaining lockres */
+ if (!list_empty(&dlm->tracking_list)) {
+ mlog(ML_ERROR, "Following lockres' are still on the "
+ "tracking list:\n");
+ list_for_each_entry(res, &dlm->tracking_list, tracking)
+ dlm_print_one_lock_resource(res);
+ }
+
dlm_mark_domain_leaving(dlm);
dlm_leave_domain(dlm);
+ printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name);
+ dlm_force_free_mles(dlm);
dlm_complete_dlm_shutdown(dlm);
}
dlm_put(dlm);
}
EXPORT_SYMBOL_GPL(dlm_unregister_domain);
-static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
+static int dlm_query_join_proto_check(char *proto_type, int node,
+ struct dlm_protocol_version *ours,
+ struct dlm_protocol_version *request)
+{
+ int rc;
+ struct dlm_protocol_version proto = *request;
+
+ if (!dlm_protocol_compare(ours, &proto)) {
+ mlog(0,
+ "node %u wanted to join with %s locking protocol "
+ "%u.%u, we respond with %u.%u\n",
+ node, proto_type,
+ request->pv_major,
+ request->pv_minor,
+ proto.pv_major, proto.pv_minor);
+ request->pv_minor = proto.pv_minor;
+ rc = 0;
+ } else {
+ mlog(ML_NOTICE,
+ "Node %u wanted to join with %s locking "
+ "protocol %u.%u, but we have %u.%u, disallowing\n",
+ node, proto_type,
+ request->pv_major,
+ request->pv_minor,
+ ours->pv_major,
+ ours->pv_minor);
+ rc = 1;
+ }
+
+ return rc;
+}
+
+/*
+ * struct dlm_query_join_packet is made up of four one-byte fields. They
+ * are effectively in big-endian order already. However, little-endian
+ * machines swap them before putting the packet on the wire (because
+ * query_join's response is a status, and that status is treated as a u32
+ * on the wire). Thus, a big-endian and little-endian machines will treat
+ * this structure differently.
+ *
+ * The solution is to have little-endian machines swap the structure when
+ * converting from the structure to the u32 representation. This will
+ * result in the structure having the correct format on the wire no matter
+ * the host endian format.
+ */
+static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet,
+ u32 *wire)
+{
+ union dlm_query_join_response response;
+
+ response.packet = *packet;
+ *wire = be32_to_cpu(response.intval);
+}
+
+static void dlm_query_join_wire_to_packet(u32 wire,
+ struct dlm_query_join_packet *packet)
+{
+ union dlm_query_join_response response;
+
+ response.intval = cpu_to_be32(wire);
+ *packet = response.packet;
+}
+
+static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_query_join_request *query;
- enum dlm_query_join_response response;
+ struct dlm_query_join_packet packet = {
+ .code = JOIN_DISALLOW,
+ };
struct dlm_ctxt *dlm = NULL;
+ u32 response;
+ u8 nodenum;
query = (struct dlm_query_join_request *) msg->buf;
@@ -599,16 +843,38 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
mlog(0, "node %u is not in our live map yet\n",
query->node_idx);
- response = JOIN_DISALLOW;
+ packet.code = JOIN_DISALLOW;
goto respond;
}
- response = JOIN_OK_NO_MAP;
+ packet.code = JOIN_OK_NO_MAP;
spin_lock(&dlm_domain_lock);
dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
+ if (!dlm)
+ goto unlock_respond;
+
+ /*
+ * There is a small window where the joining node may not see the
+ * node(s) that just left but still part of the cluster. DISALLOW
+ * join request if joining node has different node map.
+ */
+ nodenum=0;
+ while (nodenum < O2NM_MAX_NODES) {
+ if (test_bit(nodenum, dlm->domain_map)) {
+ if (!byte_test_bit(nodenum, query->node_map)) {
+ mlog(0, "disallow join as node %u does not "
+ "have node %u in its nodemap\n",
+ query->node_idx, nodenum);
+ packet.code = JOIN_DISALLOW;
+ goto unlock_respond;
+ }
+ }
+ nodenum++;
+ }
+
/* Once the dlm ctxt is marked as leaving then we don't want
- * to be put in someone's domain map.
+ * to be put in someone's domain map.
* Also, explicitly disallow joining at certain troublesome
* times (ie. during recovery). */
if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
@@ -620,43 +886,60 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
/*If this is a brand new context and we
* haven't started our join process yet, then
* the other node won the race. */
- response = JOIN_OK_NO_MAP;
+ packet.code = JOIN_OK_NO_MAP;
} else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
/* Disallow parallel joins. */
- response = JOIN_DISALLOW;
+ packet.code = JOIN_DISALLOW;
} else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
- mlog(ML_NOTICE, "node %u trying to join, but recovery "
+ mlog(0, "node %u trying to join, but recovery "
"is ongoing.\n", bit);
- response = JOIN_DISALLOW;
+ packet.code = JOIN_DISALLOW;
} else if (test_bit(bit, dlm->recovery_map)) {
- mlog(ML_NOTICE, "node %u trying to join, but it "
+ mlog(0, "node %u trying to join, but it "
"still needs recovery.\n", bit);
- response = JOIN_DISALLOW;
+ packet.code = JOIN_DISALLOW;
} else if (test_bit(bit, dlm->domain_map)) {
- mlog(ML_NOTICE, "node %u trying to join, but it "
+ mlog(0, "node %u trying to join, but it "
"is still in the domain! needs recovery?\n",
bit);
- response = JOIN_DISALLOW;
+ packet.code = JOIN_DISALLOW;
} else {
/* Alright we're fully a part of this domain
* so we keep some state as to who's joining
* and indicate to him that needs to be fixed
* up. */
- response = JOIN_OK;
- __dlm_set_joining_node(dlm, query->node_idx);
+
+ /* Make sure we speak compatible locking protocols. */
+ if (dlm_query_join_proto_check("DLM", bit,
+ &dlm->dlm_locking_proto,
+ &query->dlm_proto)) {
+ packet.code = JOIN_PROTOCOL_MISMATCH;
+ } else if (dlm_query_join_proto_check("fs", bit,
+ &dlm->fs_locking_proto,
+ &query->fs_proto)) {
+ packet.code = JOIN_PROTOCOL_MISMATCH;
+ } else {
+ packet.dlm_minor = query->dlm_proto.pv_minor;
+ packet.fs_minor = query->fs_proto.pv_minor;
+ packet.code = JOIN_OK;
+ __dlm_set_joining_node(dlm, query->node_idx);
+ }
}
spin_unlock(&dlm->spinlock);
}
+unlock_respond:
spin_unlock(&dlm_domain_lock);
respond:
- mlog(0, "We respond with %u\n", response);
+ mlog(0, "We respond with %u\n", packet.code);
+ dlm_query_join_packet_to_wire(&packet, &response);
return response;
}
-static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
+static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_assert_joined *assert;
struct dlm_ctxt *dlm = NULL;
@@ -676,10 +959,19 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
* domain. Set him in the map and clean up our
* leftover join state. */
BUG_ON(dlm->joining_node != assert->node_idx);
+
+ if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+ mlog(0, "dlm recovery is ongoing, disallow join\n");
+ spin_unlock(&dlm->spinlock);
+ spin_unlock(&dlm_domain_lock);
+ return -EAGAIN;
+ }
+
set_bit(assert->node_idx, dlm->domain_map);
+ clear_bit(assert->node_idx, dlm->exit_domain_map);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n",
+ printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ",
assert->node_idx, dlm->name);
__dlm_print_nodes(dlm);
@@ -693,7 +985,373 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
return 0;
}
-static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data)
+static int dlm_match_regions(struct dlm_ctxt *dlm,
+ struct dlm_query_region *qr,
+ char *local, int locallen)
+{
+ char *remote = qr->qr_regions;
+ char *l, *r;
+ int localnr, i, j, foundit;
+ int status = 0;
+
+ if (!o2hb_global_heartbeat_active()) {
+ if (qr->qr_numregions) {
+ mlog(ML_ERROR, "Domain %s: Joining node %d has global "
+ "heartbeat enabled but local node %d does not\n",
+ qr->qr_domain, qr->qr_node, dlm->node_num);
+ status = -EINVAL;
+ }
+ goto bail;
+ }
+
+ if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
+ mlog(ML_ERROR, "Domain %s: Local node %d has global "
+ "heartbeat enabled but joining node %d does not\n",
+ qr->qr_domain, dlm->node_num, qr->qr_node);
+ status = -EINVAL;
+ goto bail;
+ }
+
+ r = remote;
+ for (i = 0; i < qr->qr_numregions; ++i) {
+ mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
+ r += O2HB_MAX_REGION_NAME_LEN;
+ }
+
+ localnr = min(O2NM_MAX_REGIONS, locallen/O2HB_MAX_REGION_NAME_LEN);
+ localnr = o2hb_get_all_regions(local, (u8)localnr);
+
+ /* compare local regions with remote */
+ l = local;
+ for (i = 0; i < localnr; ++i) {
+ foundit = 0;
+ r = remote;
+ for (j = 0; j <= qr->qr_numregions; ++j) {
+ if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
+ foundit = 1;
+ break;
+ }
+ r += O2HB_MAX_REGION_NAME_LEN;
+ }
+ if (!foundit) {
+ status = -EINVAL;
+ mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+ "in local node %d but not in joining node %d\n",
+ qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
+ dlm->node_num, qr->qr_node);
+ goto bail;
+ }
+ l += O2HB_MAX_REGION_NAME_LEN;
+ }
+
+ /* compare remote with local regions */
+ r = remote;
+ for (i = 0; i < qr->qr_numregions; ++i) {
+ foundit = 0;
+ l = local;
+ for (j = 0; j < localnr; ++j) {
+ if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
+ foundit = 1;
+ break;
+ }
+ l += O2HB_MAX_REGION_NAME_LEN;
+ }
+ if (!foundit) {
+ status = -EINVAL;
+ mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+ "in joining node %d but not in local node %d\n",
+ qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
+ qr->qr_node, dlm->node_num);
+ goto bail;
+ }
+ r += O2HB_MAX_REGION_NAME_LEN;
+ }
+
+bail:
+ return status;
+}
+
+static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+ struct dlm_query_region *qr = NULL;
+ int status, ret = 0, i;
+ char *p;
+
+ if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+ goto bail;
+
+ qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
+ if (!qr) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ qr->qr_node = dlm->node_num;
+ qr->qr_namelen = strlen(dlm->name);
+ memcpy(qr->qr_domain, dlm->name, qr->qr_namelen);
+ /* if local hb, the numregions will be zero */
+ if (o2hb_global_heartbeat_active())
+ qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions,
+ O2NM_MAX_REGIONS);
+
+ p = qr->qr_regions;
+ for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN)
+ mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p);
+
+ i = -1;
+ while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
+ i + 1)) < O2NM_MAX_NODES) {
+ if (i == dlm->node_num)
+ continue;
+
+ mlog(0, "Sending regions to node %d\n", i);
+
+ ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr,
+ sizeof(struct dlm_query_region),
+ i, &status);
+ if (ret >= 0)
+ ret = status;
+ if (ret) {
+ mlog(ML_ERROR, "Region mismatch %d, node %d\n",
+ ret, i);
+ break;
+ }
+ }
+
+bail:
+ kfree(qr);
+ return ret;
+}
+
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+ void *data, void **ret_data)
+{
+ struct dlm_query_region *qr;
+ struct dlm_ctxt *dlm = NULL;
+ char *local = NULL;
+ int status = 0;
+
+ qr = (struct dlm_query_region *) msg->buf;
+
+ mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
+ qr->qr_domain);
+
+ /* buffer used in dlm_mast_regions() */
+ local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
+ if (!local)
+ return -ENOMEM;
+
+ status = -EINVAL;
+
+ spin_lock(&dlm_domain_lock);
+ dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
+ if (!dlm) {
+ mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+ "before join domain\n", qr->qr_node, qr->qr_domain);
+ goto out_domain_lock;
+ }
+
+ spin_lock(&dlm->spinlock);
+ if (dlm->joining_node != qr->qr_node) {
+ mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+ "but joining node is %d\n", qr->qr_node, qr->qr_domain,
+ dlm->joining_node);
+ goto out_dlm_lock;
+ }
+
+ /* Support for global heartbeat was added in 1.1 */
+ if (dlm->dlm_locking_proto.pv_major == 1 &&
+ dlm->dlm_locking_proto.pv_minor == 0) {
+ mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+ "but active dlm protocol is %d.%d\n", qr->qr_node,
+ qr->qr_domain, dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor);
+ goto out_dlm_lock;
+ }
+
+ status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions));
+
+out_dlm_lock:
+ spin_unlock(&dlm->spinlock);
+
+out_domain_lock:
+ spin_unlock(&dlm_domain_lock);
+
+ kfree(local);
+
+ return status;
+}
+
+static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
+{
+ struct o2nm_node *local;
+ struct dlm_node_info *remote;
+ int i, j;
+ int status = 0;
+
+ for (j = 0; j < qn->qn_numnodes; ++j)
+ mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum,
+ &(qn->qn_nodes[j].ni_ipv4_address),
+ ntohs(qn->qn_nodes[j].ni_ipv4_port));
+
+ for (i = 0; i < O2NM_MAX_NODES && !status; ++i) {
+ local = o2nm_get_node_by_num(i);
+ remote = NULL;
+ for (j = 0; j < qn->qn_numnodes; ++j) {
+ if (qn->qn_nodes[j].ni_nodenum == i) {
+ remote = &(qn->qn_nodes[j]);
+ break;
+ }
+ }
+
+ if (!local && !remote)
+ continue;
+
+ if ((local && !remote) || (!local && remote))
+ status = -EINVAL;
+
+ if (!status &&
+ ((remote->ni_nodenum != local->nd_num) ||
+ (remote->ni_ipv4_port != local->nd_ipv4_port) ||
+ (remote->ni_ipv4_address != local->nd_ipv4_address)))
+ status = -EINVAL;
+
+ if (status) {
+ if (remote && !local)
+ mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+ "registered in joining node %d but not in "
+ "local node %d\n", qn->qn_domain,
+ remote->ni_nodenum,
+ &(remote->ni_ipv4_address),
+ ntohs(remote->ni_ipv4_port),
+ qn->qn_nodenum, dlm->node_num);
+ if (local && !remote)
+ mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+ "registered in local node %d but not in "
+ "joining node %d\n", qn->qn_domain,
+ local->nd_num, &(local->nd_ipv4_address),
+ ntohs(local->nd_ipv4_port),
+ dlm->node_num, qn->qn_nodenum);
+ BUG_ON((!local && !remote));
+ }
+
+ if (local)
+ o2nm_node_put(local);
+ }
+
+ return status;
+}
+
+static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+ struct dlm_query_nodeinfo *qn = NULL;
+ struct o2nm_node *node;
+ int ret = 0, status, count, i;
+
+ if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+ goto bail;
+
+ qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
+ if (!qn) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) {
+ node = o2nm_get_node_by_num(i);
+ if (!node)
+ continue;
+ qn->qn_nodes[count].ni_nodenum = node->nd_num;
+ qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port;
+ qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address;
+ mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
+ &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
+ ++count;
+ o2nm_node_put(node);
+ }
+
+ qn->qn_nodenum = dlm->node_num;
+ qn->qn_numnodes = count;
+ qn->qn_namelen = strlen(dlm->name);
+ memcpy(qn->qn_domain, dlm->name, qn->qn_namelen);
+
+ i = -1;
+ while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
+ i + 1)) < O2NM_MAX_NODES) {
+ if (i == dlm->node_num)
+ continue;
+
+ mlog(0, "Sending nodeinfo to node %d\n", i);
+
+ ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+ qn, sizeof(struct dlm_query_nodeinfo),
+ i, &status);
+ if (ret >= 0)
+ ret = status;
+ if (ret) {
+ mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i);
+ break;
+ }
+ }
+
+bail:
+ kfree(qn);
+ return ret;
+}
+
+static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
+ void *data, void **ret_data)
+{
+ struct dlm_query_nodeinfo *qn;
+ struct dlm_ctxt *dlm = NULL;
+ int locked = 0, status = -EINVAL;
+
+ qn = (struct dlm_query_nodeinfo *) msg->buf;
+
+ mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
+ qn->qn_domain);
+
+ spin_lock(&dlm_domain_lock);
+ dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
+ if (!dlm) {
+ mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
+ "join domain\n", qn->qn_nodenum, qn->qn_domain);
+ goto bail;
+ }
+
+ spin_lock(&dlm->spinlock);
+ locked = 1;
+ if (dlm->joining_node != qn->qn_nodenum) {
+ mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
+ "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
+ dlm->joining_node);
+ goto bail;
+ }
+
+ /* Support for node query was added in 1.1 */
+ if (dlm->dlm_locking_proto.pv_major == 1 &&
+ dlm->dlm_locking_proto.pv_minor == 0) {
+ mlog(ML_ERROR, "Node %d queried nodes on domain %s "
+ "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
+ qn->qn_domain, dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor);
+ goto bail;
+ }
+
+ status = dlm_match_nodes(dlm, qn);
+
+bail:
+ if (locked)
+ spin_unlock(&dlm->spinlock);
+ spin_unlock(&dlm_domain_lock);
+
+ return status;
+}
+
+static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_cancel_join *cancel;
struct dlm_ctxt *dlm = NULL;
@@ -736,7 +1394,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
&cancel_msg, sizeof(cancel_msg), node,
NULL);
if (status < 0) {
- mlog_errno(status);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+ node);
goto bail;
}
@@ -756,7 +1416,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
sizeof(unsigned long))) {
mlog(ML_ERROR,
"map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n",
- map_size, BITS_TO_LONGS(O2NM_MAX_NODES));
+ map_size, (unsigned)BITS_TO_LONGS(O2NM_MAX_NODES));
return -EINVAL;
}
@@ -783,10 +1443,12 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
static int dlm_request_join(struct dlm_ctxt *dlm,
int node,
- enum dlm_query_join_response *response)
+ enum dlm_query_join_response_code *response)
{
- int status, retval;
+ int status;
struct dlm_query_join_request join_msg;
+ struct dlm_query_join_packet packet;
+ u32 join_resp;
mlog(0, "querying node %d\n", node);
@@ -794,13 +1456,21 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
join_msg.node_idx = dlm->node_num;
join_msg.name_len = strlen(dlm->name);
memcpy(join_msg.domain, dlm->name, join_msg.name_len);
+ join_msg.dlm_proto = dlm->dlm_locking_proto;
+ join_msg.fs_proto = dlm->fs_locking_proto;
+
+ /* copy live node map to join message */
+ byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
- sizeof(join_msg), node, &retval);
+ sizeof(join_msg), node, &join_resp);
if (status < 0 && status != -ENOPROTOOPT) {
- mlog_errno(status);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
+ node);
goto bail;
}
+ dlm_query_join_wire_to_packet(join_resp, &packet);
/* -ENOPROTOOPT from the net code means the other side isn't
listening for our message type -- that's fine, it means
@@ -809,18 +1479,43 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
if (status == -ENOPROTOOPT) {
status = 0;
*response = JOIN_OK_NO_MAP;
- } else if (retval == JOIN_DISALLOW ||
- retval == JOIN_OK ||
- retval == JOIN_OK_NO_MAP) {
- *response = retval;
+ } else if (packet.code == JOIN_DISALLOW ||
+ packet.code == JOIN_OK_NO_MAP) {
+ *response = packet.code;
+ } else if (packet.code == JOIN_PROTOCOL_MISMATCH) {
+ mlog(ML_NOTICE,
+ "This node requested DLM locking protocol %u.%u and "
+ "filesystem locking protocol %u.%u. At least one of "
+ "the protocol versions on node %d is not compatible, "
+ "disconnecting\n",
+ dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor,
+ dlm->fs_locking_proto.pv_major,
+ dlm->fs_locking_proto.pv_minor,
+ node);
+ status = -EPROTO;
+ *response = packet.code;
+ } else if (packet.code == JOIN_OK) {
+ *response = packet.code;
+ /* Use the same locking protocol as the remote node */
+ dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
+ dlm->fs_locking_proto.pv_minor = packet.fs_minor;
+ mlog(0,
+ "Node %d responds JOIN_OK with DLM locking protocol "
+ "%u.%u and fs locking protocol %u.%u\n",
+ node,
+ dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor,
+ dlm->fs_locking_proto.pv_major,
+ dlm->fs_locking_proto.pv_minor);
} else {
status = -EINVAL;
- mlog(ML_ERROR, "invalid response %d from node %u\n", retval,
- node);
+ mlog(ML_ERROR, "invalid response %d from node %u\n",
+ packet.code, node);
}
mlog(0, "status %d, node %d response is %d\n", status, node,
- *response);
+ *response);
bail:
return status;
@@ -830,6 +1525,7 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
unsigned int node)
{
int status;
+ int ret;
struct dlm_assert_joined assert_msg;
mlog(0, "Sending join assert to node %u\n", node);
@@ -841,9 +1537,13 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
&assert_msg, sizeof(assert_msg), node,
- NULL);
+ &ret);
if (status < 0)
- mlog_errno(status);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+ node);
+ else
+ status = ret;
return status;
}
@@ -889,7 +1589,7 @@ struct domain_join_ctxt {
static int dlm_should_restart_join(struct dlm_ctxt *dlm,
struct domain_join_ctxt *ctxt,
- enum dlm_query_join_response response)
+ enum dlm_query_join_response_code response)
{
int ret;
@@ -915,11 +1615,11 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
{
int status = 0, tmpstat, node;
struct domain_join_ctxt *ctxt;
- enum dlm_query_join_response response;
+ enum dlm_query_join_response_code response = JOIN_DISALLOW;
- mlog_entry("%p", dlm);
+ mlog(0, "%p", dlm);
- ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL);
+ ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
if (!ctxt) {
status = -ENOMEM;
mlog_errno(status);
@@ -973,6 +1673,21 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
set_bit(dlm->node_num, dlm->domain_map);
spin_unlock(&dlm->spinlock);
+ /* Support for global heartbeat and node info was added in 1.1 */
+ if (dlm->dlm_locking_proto.pv_major > 1 ||
+ dlm->dlm_locking_proto.pv_minor > 0) {
+ status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ status = dlm_send_regions(dlm, ctxt->yes_resp_map);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
/* Joined state *must* be set before the joining node
@@ -987,8 +1702,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
bail:
spin_lock(&dlm->spinlock);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- if (!status)
+ if (!status) {
+ printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name);
__dlm_print_nodes(dlm);
+ }
spin_unlock(&dlm->spinlock);
if (ctxt) {
@@ -1009,8 +1726,8 @@ bail:
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
{
- o2hb_unregister_callback(&dlm->dlm_hb_up);
- o2hb_unregister_callback(&dlm->dlm_hb_down);
+ o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up);
+ o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);
o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
}
@@ -1022,111 +1739,126 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
- status = o2hb_register_callback(&dlm->dlm_hb_down);
+ status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
if (status)
goto bail;
o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
- status = o2hb_register_callback(&dlm->dlm_hb_up);
+ status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
if (status)
goto bail;
status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key,
sizeof(struct dlm_master_request),
dlm_master_request_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key,
sizeof(struct dlm_assert_master),
dlm_assert_master_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, dlm_assert_master_post_handler,
+ &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key,
sizeof(struct dlm_create_lock),
dlm_create_lock_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key,
DLM_CONVERT_LOCK_MAX_LEN,
dlm_convert_lock_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key,
DLM_UNLOCK_LOCK_MAX_LEN,
dlm_unlock_lock_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key,
DLM_PROXY_AST_MAX_LEN,
dlm_proxy_ast_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key,
sizeof(struct dlm_exit_domain),
dlm_exit_domain_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
+ if (status)
+ goto bail;
+
+ status = o2net_register_handler(DLM_DEREF_LOCKRES_MSG, dlm->key,
+ sizeof(struct dlm_deref_lockres),
+ dlm_deref_lockres_handler,
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key,
sizeof(struct dlm_migrate_request),
dlm_migrate_request_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key,
DLM_MIG_LOCKRES_MAX_LEN,
dlm_mig_lockres_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key,
sizeof(struct dlm_master_requery),
dlm_master_requery_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key,
sizeof(struct dlm_lock_request),
dlm_request_all_locks_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key,
sizeof(struct dlm_reco_data_done),
dlm_reco_data_done_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key,
sizeof(struct dlm_begin_reco),
dlm_begin_reco_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key,
sizeof(struct dlm_finalize_reco),
dlm_finalize_reco_handler,
- dlm, &dlm->dlm_domain_handlers);
+ dlm, NULL, &dlm->dlm_domain_handlers);
+ if (status)
+ goto bail;
+
+ status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
+ sizeof(struct dlm_exit_domain),
+ dlm_begin_exit_domain_handler,
+ dlm, NULL, &dlm->dlm_domain_handlers);
if (status)
goto bail;
@@ -1140,6 +1872,8 @@ bail:
static int dlm_join_domain(struct dlm_ctxt *dlm)
{
int status;
+ unsigned int backoff;
+ unsigned int total_backoff = 0;
BUG_ON(!dlm);
@@ -1163,6 +1897,12 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
goto bail;
}
+ status = dlm_debug_init(dlm);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
if (!dlm->dlm_worker) {
status = -ENOMEM;
@@ -1171,18 +1911,27 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
}
do {
- unsigned int backoff;
status = dlm_try_to_join_domain(dlm);
/* If we're racing another node to the join, then we
* need to back off temporarily and let them
* complete. */
+#define DLM_JOIN_TIMEOUT_MSECS 90000
if (status == -EAGAIN) {
if (signal_pending(current)) {
status = -ERESTARTSYS;
goto bail;
}
+ if (total_backoff >
+ msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) {
+ status = -ERESTARTSYS;
+ mlog(ML_NOTICE, "Timed out joining dlm domain "
+ "%s after %u msecs\n", dlm->name,
+ jiffies_to_msecs(total_backoff));
+ goto bail;
+ }
+
/*
* <chip> After you!
* <dale> No, after you!
@@ -1192,6 +1941,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
*/
backoff = (unsigned int)(jiffies & 0x3);
backoff *= DLM_DOMAIN_BACKOFF_MS;
+ total_backoff += backoff;
mlog(0, "backoff %d\n", backoff);
msleep(backoff);
}
@@ -1208,6 +1958,7 @@ bail:
if (status) {
dlm_unregister_domain_handlers(dlm);
+ dlm_debug_shutdown(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
dlm_destroy_dlm_worker(dlm);
@@ -1220,15 +1971,16 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
u32 key)
{
int i;
+ int ret;
struct dlm_ctxt *dlm = NULL;
- dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL);
+ dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
if (!dlm) {
mlog_errno(-ENOMEM);
goto leave;
}
- dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+ dlm->name = kstrdup(domain, GFP_KERNEL);
if (dlm->name == NULL) {
mlog_errno(-ENOMEM);
kfree(dlm);
@@ -1248,20 +2000,44 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
for (i = 0; i < DLM_HASH_BUCKETS; i++)
INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
- strcpy(dlm->name, domain);
+ dlm->master_hash = (struct hlist_head **)
+ dlm_alloc_pagevec(DLM_HASH_PAGES);
+ if (!dlm->master_hash) {
+ mlog_errno(-ENOMEM);
+ dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+ kfree(dlm->name);
+ kfree(dlm);
+ dlm = NULL;
+ goto leave;
+ }
+
+ for (i = 0; i < DLM_HASH_BUCKETS; i++)
+ INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
+
dlm->key = key;
dlm->node_num = o2nm_this_node();
+ ret = dlm_create_debugfs_subroot(dlm);
+ if (ret < 0) {
+ dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
+ dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+ kfree(dlm->name);
+ kfree(dlm);
+ dlm = NULL;
+ goto leave;
+ }
+
spin_lock_init(&dlm->spinlock);
spin_lock_init(&dlm->master_lock);
spin_lock_init(&dlm->ast_lock);
+ spin_lock_init(&dlm->track_lock);
INIT_LIST_HEAD(&dlm->list);
INIT_LIST_HEAD(&dlm->dirty_list);
INIT_LIST_HEAD(&dlm->reco.resources);
- INIT_LIST_HEAD(&dlm->reco.received);
INIT_LIST_HEAD(&dlm->reco.node_data);
INIT_LIST_HEAD(&dlm->purge_list);
INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
+ INIT_LIST_HEAD(&dlm->tracking_list);
dlm->reco.state = 0;
INIT_LIST_HEAD(&dlm->pending_asts);
@@ -1282,7 +2058,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
init_waitqueue_head(&dlm->reco.event);
init_waitqueue_head(&dlm->ast_wq);
init_waitqueue_head(&dlm->migration_wq);
- INIT_LIST_HEAD(&dlm->master_list);
INIT_LIST_HEAD(&dlm->mle_hb_events);
dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
@@ -1290,13 +2065,17 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
- atomic_set(&dlm->local_resources, 0);
- atomic_set(&dlm->remote_resources, 0);
- atomic_set(&dlm->unknown_resources, 0);
+
+ atomic_set(&dlm->res_tot_count, 0);
+ atomic_set(&dlm->res_cur_count, 0);
+ for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) {
+ atomic_set(&dlm->mle_tot_count[i], 0);
+ atomic_set(&dlm->mle_cur_count[i], 0);
+ }
spin_lock_init(&dlm->work_lock);
INIT_LIST_HEAD(&dlm->work_list);
- INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work, dlm);
+ INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work);
kref_init(&dlm->dlm_refs);
dlm->dlm_state = DLM_CTXT_NEW;
@@ -1311,28 +2090,49 @@ leave:
}
/*
- * dlm_register_domain: one-time setup per "domain"
+ * Compare a requested locking protocol version against the current one.
+ *
+ * If the major numbers are different, they are incompatible.
+ * If the current minor is greater than the request, they are incompatible.
+ * If the current minor is less than or equal to the request, they are
+ * compatible, and the requester should run at the current minor version.
+ */
+static int dlm_protocol_compare(struct dlm_protocol_version *existing,
+ struct dlm_protocol_version *request)
+{
+ if (existing->pv_major != request->pv_major)
+ return 1;
+
+ if (existing->pv_minor > request->pv_minor)
+ return 1;
+
+ if (existing->pv_minor < request->pv_minor)
+ request->pv_minor = existing->pv_minor;
+
+ return 0;
+}
+
+/*
+ * dlm_register_domain: one-time setup per "domain".
+ *
+ * The filesystem passes in the requested locking version via proto.
+ * If registration was successful, proto will contain the negotiated
+ * locking protocol.
*/
struct dlm_ctxt * dlm_register_domain(const char *domain,
- u32 key)
+ u32 key,
+ struct dlm_protocol_version *fs_proto)
{
int ret;
struct dlm_ctxt *dlm = NULL;
struct dlm_ctxt *new_ctxt = NULL;
- if (strlen(domain) > O2NM_MAX_NAME_LEN) {
+ if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
ret = -ENAMETOOLONG;
mlog(ML_ERROR, "domain name length too long\n");
goto leave;
}
- if (!o2hb_check_local_node_heartbeating()) {
- mlog(ML_ERROR, "the local node has not been configured, or is "
- "not heartbeating\n");
- ret = -EPROTO;
- goto leave;
- }
-
mlog(0, "register called for domain \"%s\"\n", domain);
retry:
@@ -1357,6 +2157,16 @@ retry:
goto retry;
}
+ if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
+ spin_unlock(&dlm_domain_lock);
+ mlog(ML_ERROR,
+ "Requested locking protocol version is not "
+ "compatible with already registered domain "
+ "\"%s\"\n", domain);
+ ret = -EPROTO;
+ goto leave;
+ }
+
__dlm_get(dlm);
dlm->num_joins++;
@@ -1387,6 +2197,13 @@ retry:
list_add_tail(&dlm->list, &dlm_domains);
spin_unlock(&dlm_domain_lock);
+ /*
+ * Pass the locking protocol version into the join. If the join
+ * succeeds, it will have the negotiated protocol set.
+ */
+ dlm->dlm_locking_proto = dlm_protocol;
+ dlm->fs_locking_proto = *fs_proto;
+
ret = dlm_join_domain(dlm);
if (ret) {
mlog_errno(ret);
@@ -1394,6 +2211,9 @@ retry:
goto leave;
}
+ /* Tell the caller what locking protocol we negotiated */
+ *fs_proto = dlm->fs_locking_proto;
+
ret = 0;
leave:
if (new_ctxt)
@@ -1420,22 +2240,36 @@ static int dlm_register_net_handlers(void)
status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
sizeof(struct dlm_query_join_request),
dlm_query_join_handler,
- NULL, &dlm_join_handlers);
+ NULL, NULL, &dlm_join_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
sizeof(struct dlm_assert_joined),
dlm_assert_joined_handler,
- NULL, &dlm_join_handlers);
+ NULL, NULL, &dlm_join_handlers);
if (status)
goto bail;
status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
sizeof(struct dlm_cancel_join),
dlm_cancel_join_handler,
- NULL, &dlm_join_handlers);
+ NULL, NULL, &dlm_join_handlers);
+ if (status)
+ goto bail;
+
+ status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
+ sizeof(struct dlm_query_region),
+ dlm_query_region_handler,
+ NULL, NULL, &dlm_join_handlers);
+
+ if (status)
+ goto bail;
+ status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+ sizeof(struct dlm_query_nodeinfo),
+ dlm_query_nodeinfo_handler,
+ NULL, NULL, &dlm_join_handlers);
bail:
if (status < 0)
dlm_unregister_net_handlers();
@@ -1459,13 +2293,10 @@ static DECLARE_RWSEM(dlm_callback_sem);
void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
int node_num)
{
- struct list_head *iter;
struct dlm_eviction_cb *cb;
down_read(&dlm_callback_sem);
- list_for_each(iter, &dlm->dlm_eviction_callbacks) {
- cb = list_entry(iter, struct dlm_eviction_cb, ec_item);
-
+ list_for_each_entry(cb, &dlm->dlm_eviction_callbacks, ec_item) {
cb->ec_func(node_num, cb->ec_data);
}
up_read(&dlm_callback_sem);
@@ -1502,29 +2333,56 @@ static int __init dlm_init(void)
{
int status;
- dlm_print_version();
-
status = dlm_init_mle_cache();
- if (status)
- return -1;
+ if (status) {
+ mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
+ goto error;
+ }
+
+ status = dlm_init_master_caches();
+ if (status) {
+ mlog(ML_ERROR, "Could not create o2dlm_lockres and "
+ "o2dlm_lockname slabcaches\n");
+ goto error;
+ }
+
+ status = dlm_init_lock_cache();
+ if (status) {
+ mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n");
+ goto error;
+ }
status = dlm_register_net_handlers();
if (status) {
- dlm_destroy_mle_cache();
- return -1;
+ mlog(ML_ERROR, "Unable to register network handlers\n");
+ goto error;
}
+ status = dlm_create_debugfs_root();
+ if (status)
+ goto error;
+
return 0;
+error:
+ dlm_unregister_net_handlers();
+ dlm_destroy_lock_cache();
+ dlm_destroy_master_caches();
+ dlm_destroy_mle_cache();
+ return -1;
}
static void __exit dlm_exit (void)
{
+ dlm_destroy_debugfs_root();
dlm_unregister_net_handlers();
+ dlm_destroy_lock_cache();
+ dlm_destroy_master_caches();
dlm_destroy_mle_cache();
}
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 Distributed Lock Management");
module_init(dlm_init);
module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
deleted file mode 100644
index d2be3ad841f..00000000000
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmfsver.h"
-
-#define DLM_BUILD_VERSION "1.3.3"
-
-#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
-
-void dlmfs_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 42a1b91979b..66c2a491f68 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -53,6 +52,8 @@
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
+static struct kmem_cache *dlm_lock_cache;
+
static DEFINE_SPINLOCK(dlm_cookie_lock);
static u64 dlm_next_cookie = 1;
@@ -64,6 +65,22 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type,
static void dlm_lock_release(struct kref *kref);
static void dlm_lock_detach_lockres(struct dlm_lock *lock);
+int dlm_init_lock_cache(void)
+{
+ dlm_lock_cache = kmem_cache_create("o2dlm_lock",
+ sizeof(struct dlm_lock),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (dlm_lock_cache == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void dlm_destroy_lock_cache(void)
+{
+ if (dlm_lock_cache)
+ kmem_cache_destroy(dlm_lock_cache);
+}
+
/* Tell us whether we can grant a new lock request.
* locking:
* caller needs: res->spinlock
@@ -74,21 +91,19 @@ static void dlm_lock_detach_lockres(struct dlm_lock *lock);
static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
- struct list_head *iter;
struct dlm_lock *tmplock;
- list_for_each(iter, &res->granted) {
- tmplock = list_entry(iter, struct dlm_lock, list);
-
+ list_for_each_entry(tmplock, &res->granted, list) {
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
return 0;
}
- list_for_each(iter, &res->converting) {
- tmplock = list_entry(iter, struct dlm_lock, list);
-
+ list_for_each_entry(tmplock, &res->converting, list) {
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
return 0;
+ if (!dlm_lock_compatible(tmplock->ml.convert_type,
+ lock->ml.type))
+ return 0;
}
return 1;
@@ -108,7 +123,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
int call_ast = 0, kick_thread = 0;
enum dlm_status status = DLM_NORMAL;
- mlog_entry("type=%d\n", lock->ml.type);
+ mlog(0, "type=%d\n", lock->ml.type);
spin_lock(&res->spinlock);
/* if called from dlm_create_lock_handler, need to
@@ -158,6 +173,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
lock->ml.node);
}
} else {
+ status = DLM_NORMAL;
dlm_lock_get(lock);
list_add_tail(&lock->list, &res->blocked);
kick_thread = 1;
@@ -203,14 +219,20 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
enum dlm_status status = DLM_DENIED;
int lockres_changed = 1;
- mlog_entry("type=%d\n", lock->ml.type);
- mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
+ mlog(0, "type=%d, lockres %.*s, flags = 0x%x\n",
+ lock->ml.type, res->lockname.len,
res->lockname.name, flags);
+ /*
+ * Wait if resource is getting recovered, remastered, etc.
+ * If the resource was remastered and new owner is self, then exit.
+ */
spin_lock(&res->spinlock);
-
- /* will exit this call with spinlock held */
__dlm_wait_on_lockres(res);
+ if (res->owner == dlm->node_num) {
+ spin_unlock(&res->spinlock);
+ return DLM_RECOVERING;
+ }
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* add lock to local (secondary) queue */
@@ -248,7 +270,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
}
dlm_revert_pending_lock(res, lock);
dlm_lock_put(lock);
- } else if (dlm_is_recovery_lock(res->lockname.name,
+ } else if (dlm_is_recovery_lock(res->lockname.name,
res->lockname.len)) {
/* special case for the $RECOVERY lock.
* there will never be an AST delivered to put
@@ -284,8 +306,6 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
int tmpret, status = 0;
enum dlm_status ret;
- mlog_entry_void();
-
memset(&create, 0, sizeof(create));
create.node_idx = dlm->node_num;
create.requested_type = lock->ml.type;
@@ -297,25 +317,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
sizeof(create), res->owner, &status);
if (tmpret >= 0) {
- // successfully sent and received
- ret = status; // this is already a dlm_status
+ ret = status;
if (ret == DLM_REJECTED) {
- mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres "
- "no longer owned by %u. that node is coming back "
- "up currently.\n", dlm->name, create.namelen,
+ mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer "
+ "owned by node %u. That node is coming back up "
+ "currently.\n", dlm->name, create.namelen,
create.name, res->owner);
dlm_print_one_lock_resource(res);
BUG();
}
} else {
- mlog_errno(tmpret);
- if (dlm_is_host_down(tmpret)) {
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to "
+ "node %u\n", dlm->name, create.namelen, create.name,
+ tmpret, res->owner);
+ if (dlm_is_host_down(tmpret))
ret = DLM_RECOVERING;
- mlog(0, "node %u died so returning DLM_RECOVERING "
- "from lock message!\n", res->owner);
- } else {
+ else
ret = dlm_err_to_dlm_status(tmpret);
- }
}
return ret;
@@ -349,7 +367,7 @@ static void dlm_lock_release(struct kref *kref)
mlog(0, "freeing kernel-allocated lksb\n");
kfree(lock->lksb);
}
- kfree(lock);
+ kmem_cache_free(dlm_lock_cache, lock);
}
/* associate a lock with it's lockres, getting a ref on the lockres */
@@ -408,15 +426,15 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
struct dlm_lock *lock;
int kernel_allocated = 0;
- lock = kcalloc(1, sizeof(*lock), GFP_NOFS);
+ lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
if (!lock)
return NULL;
if (!lksb) {
/* zero memory only if kernel-allocated */
- lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS);
+ lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
if (!lksb) {
- kfree(lock);
+ kmem_cache_free(dlm_lock_cache, lock);
return NULL;
}
kernel_allocated = 1;
@@ -437,7 +455,8 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
* held on exit: none
* returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED
*/
-int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
@@ -450,8 +469,6 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
BUG_ON(!dlm);
- mlog_entry_void();
-
if (!dlm_grab(dlm))
return DLM_REJECTED;
@@ -695,18 +712,10 @@ retry_lock:
if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
status == DLM_FORWARD) {
- mlog(0, "retrying lock with migration/"
- "recovery/in progress\n");
msleep(100);
- /* no waiting for dlm_reco_thread */
if (recovery) {
if (status != DLM_RECOVERING)
goto retry_lock;
-
- mlog(0, "%s: got RECOVERING "
- "for $RECOVERY lock, master "
- "was %u\n", dlm->name,
- res->owner);
/* wait to see the node go down, then
* drop down and allow the lockres to
* get cleaned up. need to remaster. */
@@ -718,6 +727,14 @@ retry_lock:
}
}
+ /* Inflight taken in dlm_get_lock_resource() is dropped here */
+ spin_lock(&res->spinlock);
+ dlm_lockres_drop_inflight_ref(dlm, res);
+ spin_unlock(&res->spinlock);
+
+ dlm_lockres_calc_usage(dlm, res);
+ dlm_kick_thread(dlm, res);
+
if (status != DLM_NORMAL) {
lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
if (status != DLM_NOTQUEUED)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f784177b624..82abf0cc9a1 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -48,47 +47,11 @@
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
+#include "dlmdebug.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
#include "cluster/masklog.h"
-enum dlm_mle_type {
- DLM_MLE_BLOCK,
- DLM_MLE_MASTER,
- DLM_MLE_MIGRATION
-};
-
-struct dlm_lock_name
-{
- u8 len;
- u8 name[DLM_LOCKID_NAME_MAX];
-};
-
-struct dlm_master_list_entry
-{
- struct list_head list;
- struct list_head hb_events;
- struct dlm_ctxt *dlm;
- spinlock_t spinlock;
- wait_queue_head_t wq;
- atomic_t woken;
- struct kref mle_refs;
- int inuse;
- unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- u8 master;
- u8 new_master;
- enum dlm_mle_type type;
- struct o2hb_callback_func mle_hb_up;
- struct o2hb_callback_func mle_hb_down;
- union {
- struct dlm_lock_resource *res;
- struct dlm_lock_name name;
- } u;
-};
-
static void dlm_mle_node_down(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle,
struct o2nm_node *node,
@@ -99,130 +62,29 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm,
int idx);
static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
-static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
- unsigned int namelen, void *nodemap,
- u32 flags);
+static int dlm_do_assert_master(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res,
+ void *nodemap, u32 flags);
+static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data);
static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle,
const char *name,
unsigned int namelen)
{
- struct dlm_lock_resource *res;
-
if (dlm != mle->dlm)
return 0;
- if (mle->type == DLM_MLE_BLOCK ||
- mle->type == DLM_MLE_MIGRATION) {
- if (namelen != mle->u.name.len ||
- memcmp(name, mle->u.name.name, namelen)!=0)
- return 0;
- } else {
- res = mle->u.res;
- if (namelen != res->lockname.len ||
- memcmp(res->lockname.name, name, namelen) != 0)
- return 0;
- }
- return 1;
-}
-
-#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m)
-static void _dlm_print_nodemap(unsigned long *map, const char *mapname)
-{
- int i;
- printk("%s=[ ", mapname);
- for (i=0; i<O2NM_MAX_NODES; i++)
- if (test_bit(i, map))
- printk("%d ", i);
- printk("]");
-}
-
-static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
-{
- int refs;
- char *type;
- char attached;
- u8 master;
- unsigned int namelen;
- const char *name;
- struct kref *k;
- unsigned long *maybe = mle->maybe_map,
- *vote = mle->vote_map,
- *resp = mle->response_map,
- *node = mle->node_map;
-
- k = &mle->mle_refs;
- if (mle->type == DLM_MLE_BLOCK)
- type = "BLK";
- else if (mle->type == DLM_MLE_MASTER)
- type = "MAS";
- else
- type = "MIG";
- refs = atomic_read(&k->refcount);
- master = mle->master;
- attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
-
- if (mle->type != DLM_MLE_MASTER) {
- namelen = mle->u.name.len;
- name = mle->u.name.name;
- } else {
- namelen = mle->u.res->lockname.len;
- name = mle->u.res->lockname.name;
- }
-
- mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ",
- namelen, name, type, refs, master, mle->new_master, attached,
- mle->inuse);
- dlm_print_nodemap(maybe);
- printk(", ");
- dlm_print_nodemap(vote);
- printk(", ");
- dlm_print_nodemap(resp);
- printk(", ");
- dlm_print_nodemap(node);
- printk(", ");
- printk("\n");
-}
-
-#if 0
-/* Code here is included but defined out as it aids debugging */
-
-static void dlm_dump_mles(struct dlm_ctxt *dlm)
-{
- struct dlm_master_list_entry *mle;
- struct list_head *iter;
-
- mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
- spin_lock(&dlm->master_lock);
- list_for_each(iter, &dlm->master_list) {
- mle = list_entry(iter, struct dlm_master_list_entry, list);
- dlm_print_one_mle(mle);
- }
- spin_unlock(&dlm->master_lock);
-}
-
-int dlm_dump_all_mles(const char __user *data, unsigned int len)
-{
- struct list_head *iter;
- struct dlm_ctxt *dlm;
+ if (namelen != mle->mnamelen ||
+ memcmp(name, mle->mname, namelen) != 0)
+ return 0;
- spin_lock(&dlm_domain_lock);
- list_for_each(iter, &dlm_domains) {
- dlm = list_entry (iter, struct dlm_ctxt, list);
- mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
- dlm_dump_mles(dlm);
- }
- spin_unlock(&dlm_domain_lock);
- return len;
+ return 1;
}
-EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
-
-#endif /* 0 */
-
-
-static kmem_cache_t *dlm_mle_cache = NULL;
+static struct kmem_cache *dlm_lockres_cache;
+static struct kmem_cache *dlm_lockname_cache;
+static struct kmem_cache *dlm_mle_cache;
static void dlm_mle_release(struct kref *kref);
static void dlm_init_mle(struct dlm_master_list_entry *mle,
@@ -237,7 +99,8 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
struct dlm_master_list_entry **mle,
char *name, unsigned int namelen);
-static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to);
+static int dlm_do_master_request(struct dlm_lock_resource *res,
+ struct dlm_master_list_entry *mle, int to);
static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
@@ -410,7 +273,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
mle->dlm = dlm;
mle->type = type;
- INIT_LIST_HEAD(&mle->list);
+ INIT_HLIST_NODE(&mle->master_hash_node);
INIT_LIST_HEAD(&mle->hb_events);
memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
spin_lock_init(&mle->spinlock);
@@ -422,19 +285,27 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
mle->new_master = O2NM_MAX_NODES;
mle->inuse = 0;
+ BUG_ON(mle->type != DLM_MLE_BLOCK &&
+ mle->type != DLM_MLE_MASTER &&
+ mle->type != DLM_MLE_MIGRATION);
+
if (mle->type == DLM_MLE_MASTER) {
BUG_ON(!res);
- mle->u.res = res;
- } else if (mle->type == DLM_MLE_BLOCK) {
- BUG_ON(!name);
- memcpy(mle->u.name.name, name, namelen);
- mle->u.name.len = namelen;
- } else /* DLM_MLE_MIGRATION */ {
+ mle->mleres = res;
+ memcpy(mle->mname, res->lockname.name, res->lockname.len);
+ mle->mnamelen = res->lockname.len;
+ mle->mnamehash = res->lockname.hash;
+ } else {
BUG_ON(!name);
- memcpy(mle->u.name.name, name, namelen);
- mle->u.name.len = namelen;
+ mle->mleres = NULL;
+ memcpy(mle->mname, name, namelen);
+ mle->mnamelen = namelen;
+ mle->mnamehash = dlm_lockid_hash(name, namelen);
}
+ atomic_inc(&dlm->mle_tot_count[mle->type]);
+ atomic_inc(&dlm->mle_cur_count[mle->type]);
+
/* copy off the node_map and register hb callbacks on our copy */
memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
@@ -445,6 +316,24 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
__dlm_mle_attach_hb_events(dlm, mle);
}
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&dlm->master_lock);
+
+ if (!hlist_unhashed(&mle->master_hash_node))
+ hlist_del_init(&mle->master_hash_node);
+}
+
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+ struct hlist_head *bucket;
+
+ assert_spin_locked(&dlm->master_lock);
+
+ bucket = dlm_master_hash(dlm, mle->mnamehash);
+ hlist_add_head(&mle->master_hash_node, bucket);
+}
/* returns 1 if found, 0 if not */
static int dlm_find_mle(struct dlm_ctxt *dlm,
@@ -452,12 +341,14 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
char *name, unsigned int namelen)
{
struct dlm_master_list_entry *tmpmle;
- struct list_head *iter;
+ struct hlist_head *bucket;
+ unsigned int hash;
assert_spin_locked(&dlm->master_lock);
- list_for_each(iter, &dlm->master_list) {
- tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
+ hash = dlm_lockid_hash(name, namelen);
+ bucket = dlm_master_hash(dlm, hash);
+ hlist_for_each_entry(tmpmle, bucket, master_hash_node) {
if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
continue;
dlm_get_mle(tmpmle);
@@ -470,13 +361,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
{
struct dlm_master_list_entry *mle;
- struct list_head *iter;
assert_spin_locked(&dlm->spinlock);
-
- list_for_each(iter, &dlm->mle_hb_events) {
- mle = list_entry(iter, struct dlm_master_list_entry,
- hb_events);
+
+ list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
if (node_up)
dlm_mle_node_up(dlm, mle, NULL, idx);
else
@@ -515,10 +403,10 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm,
int dlm_init_mle_cache(void)
{
- dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
+ dlm_mle_cache = kmem_cache_create("o2dlm_mle",
sizeof(struct dlm_master_list_entry),
0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
+ NULL);
if (dlm_mle_cache == NULL)
return -ENOMEM;
return 0;
@@ -535,29 +423,23 @@ static void dlm_mle_release(struct kref *kref)
struct dlm_master_list_entry *mle;
struct dlm_ctxt *dlm;
- mlog_entry_void();
-
mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
dlm = mle->dlm;
- if (mle->type != DLM_MLE_MASTER) {
- mlog(0, "calling mle_release for %.*s, type %d\n",
- mle->u.name.len, mle->u.name.name, mle->type);
- } else {
- mlog(0, "calling mle_release for %.*s, type %d\n",
- mle->u.res->lockname.len,
- mle->u.res->lockname.name, mle->type);
- }
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&dlm->master_lock);
+ mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
+ mle->type);
+
/* remove from list if not already */
- if (!list_empty(&mle->list))
- list_del_init(&mle->list);
+ __dlm_unlink_mle(dlm, mle);
/* detach the mle from the domain node up/down events */
__dlm_mle_detach_hb_events(dlm, mle);
+ atomic_dec(&dlm->mle_cur_count[mle->type]);
+
/* NOTE: kfree under spinlock here.
* if this is bad, we can move this to a freelist. */
kmem_cache_free(dlm_mle_cache, mle);
@@ -568,48 +450,46 @@ static void dlm_mle_release(struct kref *kref)
* LOCK RESOURCE FUNCTIONS
*/
-static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- u8 owner)
+int dlm_init_master_caches(void)
{
- assert_spin_locked(&res->spinlock);
-
- mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
+ dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
+ sizeof(struct dlm_lock_resource),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!dlm_lockres_cache)
+ goto bail;
+
+ dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
+ DLM_LOCKID_NAME_MAX, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!dlm_lockname_cache)
+ goto bail;
- if (owner == dlm->node_num)
- atomic_inc(&dlm->local_resources);
- else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
- atomic_inc(&dlm->unknown_resources);
- else
- atomic_inc(&dlm->remote_resources);
-
- res->owner = owner;
+ return 0;
+bail:
+ dlm_destroy_master_caches();
+ return -ENOMEM;
}
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res, u8 owner)
+void dlm_destroy_master_caches(void)
{
- assert_spin_locked(&res->spinlock);
-
- if (owner == res->owner)
- return;
-
- if (res->owner == dlm->node_num)
- atomic_dec(&dlm->local_resources);
- else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
- atomic_dec(&dlm->unknown_resources);
- else
- atomic_dec(&dlm->remote_resources);
+ if (dlm_lockname_cache) {
+ kmem_cache_destroy(dlm_lockname_cache);
+ dlm_lockname_cache = NULL;
+ }
- dlm_set_lockres_owner(dlm, res, owner);
+ if (dlm_lockres_cache) {
+ kmem_cache_destroy(dlm_lockres_cache);
+ dlm_lockres_cache = NULL;
+ }
}
-
static void dlm_lockres_release(struct kref *kref)
{
struct dlm_lock_resource *res;
+ struct dlm_ctxt *dlm;
res = container_of(kref, struct dlm_lock_resource, refs);
+ dlm = res->dlm;
/* This should not happen -- all lockres' have a name
* associated with them at init time. */
@@ -618,6 +498,18 @@ static void dlm_lockres_release(struct kref *kref)
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+ res->lockname.len, res->lockname.name);
+ dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
+ atomic_dec(&dlm->res_cur_count);
+
if (!hlist_unhashed(&res->hash_node) ||
!list_empty(&res->granted) ||
!list_empty(&res->converting) ||
@@ -650,9 +542,9 @@ static void dlm_lockres_release(struct kref *kref)
BUG_ON(!list_empty(&res->recovering));
BUG_ON(!list_empty(&res->purge));
- kfree(res->lockname.name);
+ kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
- kfree(res);
+ kmem_cache_free(dlm_lockres_cache, res);
}
void dlm_lockres_put(struct dlm_lock_resource *res)
@@ -685,11 +577,19 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
INIT_LIST_HEAD(&res->dirty);
INIT_LIST_HEAD(&res->recovering);
INIT_LIST_HEAD(&res->purge);
+ INIT_LIST_HEAD(&res->tracking);
atomic_set(&res->asts_reserved, 0);
res->migration_pending = 0;
+ res->inflight_locks = 0;
+ res->inflight_assert_workers = 0;
+
+ res->dlm = dlm;
kref_init(&res->refs);
+ atomic_inc(&dlm->res_tot_count);
+ atomic_inc(&dlm->res_cur_count);
+
/* just for consistency */
spin_lock(&res->spinlock);
dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -699,27 +599,126 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->last_used = 0;
+ spin_lock(&dlm->spinlock);
+ list_add_tail(&res->tracking, &dlm->tracking_list);
+ spin_unlock(&dlm->spinlock);
+
memset(res->lvb, 0, DLM_LVB_LEN);
+ memset(res->refmap, 0, sizeof(res->refmap));
}
struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
const char *name,
unsigned int namelen)
{
- struct dlm_lock_resource *res;
+ struct dlm_lock_resource *res = NULL;
- res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS);
+ res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
if (!res)
- return NULL;
+ goto error;
- res->lockname.name = kmalloc(namelen, GFP_NOFS);
- if (!res->lockname.name) {
- kfree(res);
- return NULL;
- }
+ res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
+ if (!res->lockname.name)
+ goto error;
dlm_init_lockres(dlm, res, name, namelen);
return res;
+
+error:
+ if (res && res->lockname.name)
+ kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
+
+ if (res)
+ kmem_cache_free(dlm_lockres_cache, res);
+ return NULL;
+}
+
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit)
+{
+ assert_spin_locked(&res->spinlock);
+
+ mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
+ res->lockname.name, bit, __builtin_return_address(0));
+
+ set_bit(bit, res->refmap);
+}
+
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit)
+{
+ assert_spin_locked(&res->spinlock);
+
+ mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
+ res->lockname.name, bit, __builtin_return_address(0));
+
+ clear_bit(bit, res->refmap);
+}
+
+
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+
+ res->inflight_locks++;
+
+ mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
+ res->lockname.len, res->lockname.name, res->inflight_locks,
+ __builtin_return_address(0));
+}
+
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+
+ BUG_ON(res->inflight_locks == 0);
+
+ res->inflight_locks--;
+
+ mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
+ res->lockname.len, res->lockname.name, res->inflight_locks,
+ __builtin_return_address(0));
+
+ wake_up(&res->wq);
+}
+
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ res->inflight_assert_workers++;
+ mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->inflight_assert_workers);
+}
+
+static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ spin_lock(&res->spinlock);
+ __dlm_lockres_grab_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
+}
+
+static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ BUG_ON(res->inflight_assert_workers == 0);
+ res->inflight_assert_workers--;
+ mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->inflight_assert_workers);
+}
+
+static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ spin_lock(&res->spinlock);
+ __dlm_lockres_drop_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
}
/*
@@ -761,10 +760,35 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
lookup:
spin_lock(&dlm->spinlock);
- tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash);
+ tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
if (tmpres) {
spin_unlock(&dlm->spinlock);
- mlog(0, "found in hash!\n");
+ spin_lock(&tmpres->spinlock);
+ /* Wait on the thread that is mastering the resource */
+ if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+ __dlm_wait_on_lockres(tmpres);
+ BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+ spin_unlock(&tmpres->spinlock);
+ dlm_lockres_put(tmpres);
+ tmpres = NULL;
+ goto lookup;
+ }
+
+ /* Wait on the resource purge to complete before continuing */
+ if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
+ BUG_ON(tmpres->owner == dlm->node_num);
+ __dlm_wait_on_lockres_flags(tmpres,
+ DLM_LOCK_RES_DROPPING_REF);
+ spin_unlock(&tmpres->spinlock);
+ dlm_lockres_put(tmpres);
+ tmpres = NULL;
+ goto lookup;
+ }
+
+ /* Grab inflight ref to pin the resource */
+ dlm_lockres_grab_inflight_ref(dlm, tmpres);
+
+ spin_unlock(&tmpres->spinlock);
if (res)
dlm_lockres_put(res);
res = tmpres;
@@ -775,8 +799,7 @@ lookup:
spin_unlock(&dlm->spinlock);
mlog(0, "allocating a new resource\n");
/* nothing found and we need to allocate one. */
- alloc_mle = (struct dlm_master_list_entry *)
- kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+ alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!alloc_mle)
goto leave;
res = dlm_new_lockres(dlm, lockid, namelen);
@@ -793,6 +816,7 @@ lookup:
spin_lock(&res->spinlock);
dlm_change_lockres_owner(dlm, res, dlm->node_num);
__dlm_insert_lockres(dlm, res);
+ dlm_lockres_grab_inflight_ref(dlm, res);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
/* lockres still marked IN_PROGRESS */
@@ -805,29 +829,40 @@ lookup:
/* if we found a block, wait for lock to be mastered by another node */
blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
if (blocked) {
+ int mig;
if (mle->type == DLM_MLE_MASTER) {
mlog(ML_ERROR, "master entry for nonexistent lock!\n");
BUG();
- } else if (mle->type == DLM_MLE_MIGRATION) {
- /* migration is in progress! */
- /* the good news is that we now know the
- * "current" master (mle->master). */
-
+ }
+ mig = (mle->type == DLM_MLE_MIGRATION);
+ /* if there is a migration in progress, let the migration
+ * finish before continuing. we can wait for the absence
+ * of the MIGRATION mle: either the migrate finished or
+ * one of the nodes died and the mle was cleaned up.
+ * if there is a BLOCK here, but it already has a master
+ * set, we are too late. the master does not have a ref
+ * for us in the refmap. detach the mle and drop it.
+ * either way, go back to the top and start over. */
+ if (mig || mle->master != O2NM_MAX_NODES) {
+ BUG_ON(mig && mle->master == dlm->node_num);
+ /* we arrived too late. the master does not
+ * have a ref for us. retry. */
+ mlog(0, "%s:%.*s: late on %s\n",
+ dlm->name, namelen, lockid,
+ mig ? "MIGRATION" : "BLOCK");
spin_unlock(&dlm->master_lock);
- assert_spin_locked(&dlm->spinlock);
-
- /* set the lockres owner and hash it */
- spin_lock(&res->spinlock);
- dlm_set_lockres_owner(dlm, res, mle->master);
- __dlm_insert_lockres(dlm, res);
- spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
/* master is known, detach */
- dlm_mle_detach_hb_events(dlm, mle);
+ if (!mig)
+ dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
mle = NULL;
- goto wake_waiters;
+ /* this is lame, but we can't wait on either
+ * the mle or lockres waitqueue here */
+ if (mig)
+ msleep(100);
+ goto lookup;
}
} else {
/* go ahead and try to master lock on this node */
@@ -836,16 +871,16 @@ lookup:
alloc_mle = NULL;
dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
set_bit(dlm->node_num, mle->maybe_map);
- list_add(&mle->list, &dlm->master_list);
+ __dlm_insert_mle(dlm, mle);
/* still holding the dlm spinlock, check the recovery map
- * to see if there are any nodes that still need to be
+ * to see if there are any nodes that still need to be
* considered. these will not appear in the mle nodemap
* but they might own this lockres. wait on them. */
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
- "recover before lock mastery can begin\n",
+ mlog(0, "%s: res %.*s, At least one node (%d) "
+ "to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
}
@@ -858,6 +893,12 @@ lookup:
/* finally add the lockres to its hash bucket */
__dlm_insert_lockres(dlm, res);
+
+ /* Grab inflight ref to pin the resource */
+ spin_lock(&res->spinlock);
+ dlm_lockres_grab_inflight_ref(dlm, res);
+ spin_unlock(&res->spinlock);
+
/* get an extra ref on the mle in case this is a BLOCK
* if so, the creator of the BLOCK may try to put the last
* ref at this time in the assert master handler, so we
@@ -872,8 +913,8 @@ redo_request:
* dlm spinlock would be detectable be a change on the mle,
* so we only need to clear out the recovery map once. */
if (dlm_is_recovery_lock(lockid, namelen)) {
- mlog(ML_NOTICE, "%s: recovery map is not empty, but "
- "must master $RECOVERY lock now\n", dlm->name);
+ mlog(0, "%s: Recovery map is not empty, but must "
+ "master $RECOVERY lock now\n", dlm->name);
if (!dlm_pre_master_reco_lockres(dlm, res))
wait_on_recovery = 0;
else {
@@ -882,7 +923,7 @@ redo_request:
msleep(500);
}
continue;
- }
+ }
dlm_kick_recovery_thread(dlm);
msleep(1000);
@@ -891,8 +932,8 @@ redo_request:
spin_lock(&dlm->spinlock);
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
- "recover before lock mastery can begin\n",
+ mlog(0, "%s: res %.*s, At least one node (%d) "
+ "to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
} else
@@ -910,7 +951,7 @@ redo_request:
ret = -EINVAL;
dlm_node_iter_init(mle->vote_map, &iter);
while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
- ret = dlm_do_master_request(mle, nodenum);
+ ret = dlm_do_master_request(res, mle, nodenum);
if (ret < 0)
mlog_errno(ret);
if (mle->master != O2NM_MAX_NODES) {
@@ -921,8 +962,8 @@ redo_request:
* yet, keep going until it does. this is how the
* master will know that asserts are needed back to
* the lower nodes. */
- mlog(0, "%s:%.*s: requests only up to %u but master "
- "is %u, keep going\n", dlm->name, namelen,
+ mlog(0, "%s: res %.*s, Requests only up to %u but "
+ "master is %u, keep going\n", dlm->name, namelen,
lockid, nodenum, mle->master);
}
}
@@ -932,14 +973,13 @@ wait:
ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
if (ret < 0) {
wait_on_recovery = 1;
- mlog(0, "%s:%.*s: node map changed, redo the "
- "master request now, blocked=%d\n",
- dlm->name, res->lockname.len,
+ mlog(0, "%s: res %.*s, Node map changed, redo the master "
+ "request now, blocked=%d\n", dlm->name, res->lockname.len,
res->lockname.name, blocked);
if (++tries > 20) {
- mlog(ML_ERROR, "%s:%.*s: spinning on "
- "dlm_wait_for_lock_mastery, blocked=%d\n",
- dlm->name, res->lockname.len,
+ mlog(ML_ERROR, "%s: res %.*s, Spinning on "
+ "dlm_wait_for_lock_mastery, blocked = %d\n",
+ dlm->name, res->lockname.len,
res->lockname.name, blocked);
dlm_print_one_lock_resource(res);
dlm_print_one_mle(mle);
@@ -948,7 +988,8 @@ wait:
goto redo_request;
}
- mlog(0, "lockres mastered by %u\n", res->owner);
+ mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
+ res->lockname.name, res->owner);
/* make sure we never continue without this */
BUG_ON(res->owner == O2NM_MAX_NODES);
@@ -998,7 +1039,7 @@ recheck:
/* this will cause the master to re-assert across
* the whole cluster, freeing up mles */
if (res->owner != dlm->node_num) {
- ret = dlm_do_master_request(mle, res->owner);
+ ret = dlm_do_master_request(res, mle, res->owner);
if (ret < 0) {
/* give recovery a chance to run */
mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
@@ -1026,7 +1067,7 @@ recheck:
ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
b = (mle->type == DLM_MLE_BLOCK);
if ((*blocked && !b) || (!*blocked && b)) {
- mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
+ mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
dlm->name, res->lockname.len, res->lockname.name,
*blocked, b);
*blocked = b;
@@ -1062,6 +1103,8 @@ recheck:
* now tell other nodes that I am
* mastering this. */
mle->master = dlm->node_num;
+ /* ref was grabbed in get_lock_resource
+ * will be dropped in dlmlock_master */
assert = 1;
sleep = 0;
}
@@ -1087,7 +1130,8 @@ recheck:
(atomic_read(&mle->woken) == 1),
timeo);
if (res->owner == O2NM_MAX_NODES) {
- mlog(0, "waiting again\n");
+ mlog(0, "%s:%.*s: waiting again\n", dlm->name,
+ res->lockname.len, res->lockname.name);
goto recheck;
}
mlog(0, "done waiting, master is %u\n", res->owner);
@@ -1100,8 +1144,7 @@ recheck:
m = dlm->node_num;
mlog(0, "about to master %.*s here, this=%u\n",
res->lockname.len, res->lockname.name, m);
- ret = dlm_do_assert_master(dlm, res->lockname.name,
- res->lockname.len, mle->vote_map, 0);
+ ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0);
if (ret) {
/* This is a failure in the network path,
* not in the response to the assert_master
@@ -1117,6 +1160,8 @@ recheck:
/* set the lockres owner */
spin_lock(&res->spinlock);
+ /* mastery reference obtained either during
+ * assert_master_handler or in get_lock_resource */
dlm_change_lockres_owner(dlm, res, m);
spin_unlock(&res->spinlock);
@@ -1250,7 +1295,7 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
res->lockname.len,
res->lockname.name);
mle->type = DLM_MLE_MASTER;
- mle->u.res = res;
+ mle->mleres = res;
}
}
}
@@ -1283,7 +1328,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
*
*/
-static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to)
+static int dlm_do_master_request(struct dlm_lock_resource *res,
+ struct dlm_master_list_entry *mle, int to)
{
struct dlm_ctxt *dlm = mle->dlm;
struct dlm_master_request request;
@@ -1294,14 +1340,8 @@ static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to)
BUG_ON(mle->type == DLM_MLE_MIGRATION);
- if (mle->type != DLM_MLE_MASTER) {
- request.namelen = mle->u.name.len;
- memcpy(request.name, mle->u.name.name, request.namelen);
- } else {
- request.namelen = mle->u.res->lockname.len;
- memcpy(request.name, mle->u.res->lockname.name,
- request.namelen);
- }
+ request.namelen = (u8)mle->mnamelen;
+ memcpy(request.name, mle->mname, request.namelen);
again:
ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
@@ -1339,6 +1379,9 @@ again:
case DLM_MASTER_RESP_YES:
set_bit(to, mle->response_map);
mlog(0, "node %u is the master, response=YES\n", to);
+ mlog(0, "%s:%.*s: master node %u now knows I have a "
+ "reference\n", dlm->name, res->lockname.len,
+ res->lockname.name, to);
mle->master = to;
break;
case DLM_MASTER_RESP_NO:
@@ -1379,7 +1422,8 @@ out:
*
* if possible, TRIM THIS DOWN!!!
*/
-int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
u8 response = DLM_MASTER_RESP_MAYBE;
struct dlm_ctxt *dlm = data;
@@ -1417,10 +1461,11 @@ way_up_top:
/* take care of the easy cases up front */
spin_lock(&res->spinlock);
- if (res->state & DLM_LOCK_RES_RECOVERING) {
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_MIGRATING)) {
spin_unlock(&res->spinlock);
mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
- "being recovered\n");
+ "being recovered/migrated\n");
response = DLM_MASTER_RESP_ERROR;
if (mle)
kmem_cache_free(dlm_mle_cache, mle);
@@ -1428,8 +1473,8 @@ way_up_top:
}
if (res->owner == dlm->node_num) {
+ dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
spin_unlock(&res->spinlock);
- // mlog(0, "this node is the master\n");
response = DLM_MASTER_RESP_YES;
if (mle)
kmem_cache_free(dlm_mle_cache, mle);
@@ -1477,7 +1522,6 @@ way_up_top:
mlog(0, "node %u is master, but trying to migrate to "
"node %u.\n", tmpmle->master, tmpmle->new_master);
if (tmpmle->master == dlm->node_num) {
- response = DLM_MASTER_RESP_YES;
mlog(ML_ERROR, "no owner on lockres, but this "
"node is trying to migrate it to %u?!\n",
tmpmle->new_master);
@@ -1494,6 +1538,8 @@ way_up_top:
* go back and clean the mles on any
* other nodes */
dispatch_assert = 1;
+ dlm_lockres_set_refmap_bit(dlm, res,
+ request->node_idx);
} else
response = DLM_MASTER_RESP_NO;
} else {
@@ -1530,8 +1576,7 @@ way_up_top:
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
- mle = (struct dlm_master_list_entry *)
- kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+ mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
response = DLM_MASTER_RESP_ERROR;
mlog_errno(-ENOMEM);
@@ -1544,7 +1589,7 @@ way_up_top:
// "add the block.\n");
dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
set_bit(request->node_idx, mle->maybe_map);
- list_add(&mle->list, &dlm->master_list);
+ __dlm_insert_mle(dlm, mle);
response = DLM_MASTER_RESP_NO;
} else {
// mlog(0, "mle was found\n");
@@ -1575,7 +1620,12 @@ way_up_top:
dlm_put_mle(tmpmle);
}
send_response:
-
+ /*
+ * __dlm_lookup_lockres() grabbed a reference to this lockres.
+ * The reference is released by dlm_assert_master_worker() under
+ * the call to dlm_dispatch_assert_master(). If
+ * dlm_assert_master_worker() isn't called, we drop it here.
+ */
if (dispatch_assert) {
if (response != DLM_MASTER_RESP_YES)
mlog(ML_ERROR, "invalid response %d\n", response);
@@ -1585,12 +1635,17 @@ send_response:
}
mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
dlm->node_num, res->lockname.len, res->lockname.name);
- ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
+ ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
DLM_ASSERT_MASTER_MLE_CLEANUP);
if (ret < 0) {
mlog(ML_ERROR, "failed to dispatch assert master work\n");
response = DLM_MASTER_RESP_ERROR;
- }
+ dlm_lockres_put(res);
+ } else
+ dlm_lockres_grab_inflight_worker(dlm, res);
+ } else {
+ if (res)
+ dlm_lockres_put(res);
}
dlm_put(dlm);
@@ -1607,17 +1662,24 @@ send_response:
* can periodically run all locks owned by this node
* and re-assert across the cluster...
*/
-static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
- unsigned int namelen, void *nodemap,
- u32 flags)
+static int dlm_do_assert_master(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res,
+ void *nodemap, u32 flags)
{
struct dlm_assert_master assert;
int to, tmpret;
struct dlm_node_iter iter;
int ret = 0;
int reassert;
+ const char *lockname = res->lockname.name;
+ unsigned int namelen = res->lockname.len;
BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+
+ spin_lock(&res->spinlock);
+ res->state |= DLM_LOCK_RES_SETREF_INPROG;
+ spin_unlock(&res->spinlock);
+
again:
reassert = 0;
@@ -1638,7 +1700,9 @@ again:
tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
&assert, sizeof(assert), to, &r);
if (tmpret < 0) {
- mlog(0, "assert_master returned %d!\n", tmpret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", tmpret,
+ DLM_ASSERT_MASTER_MSG, dlm->key, to);
if (!dlm_is_host_down(tmpret)) {
mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
BUG();
@@ -1647,6 +1711,7 @@ again:
mlog(0, "link to %d went down!\n", to);
/* any nonzero status return will do */
ret = tmpret;
+ r = 0;
} else if (r < 0) {
/* ok, something horribly messed. kill thyself. */
mlog(ML_ERROR,"during assert master of %.*s to %u, "
@@ -1661,17 +1726,39 @@ again:
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
BUG();
- } else if (r == EAGAIN) {
+ }
+
+ if (r & DLM_ASSERT_RESPONSE_REASSERT &&
+ !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) {
+ mlog(ML_ERROR, "%.*s: very strange, "
+ "master MLE but no lockres on %u\n",
+ namelen, lockname, to);
+ }
+
+ if (r & DLM_ASSERT_RESPONSE_REASSERT) {
mlog(0, "%.*s: node %u create mles on other "
- "nodes and requests a re-assert\n",
+ "nodes and requests a re-assert\n",
namelen, lockname, to);
reassert = 1;
}
+ if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) {
+ mlog(0, "%.*s: node %u has a reference to this "
+ "lockres, set the bit in the refmap\n",
+ namelen, lockname, to);
+ spin_lock(&res->spinlock);
+ dlm_lockres_set_refmap_bit(dlm, res, to);
+ spin_unlock(&res->spinlock);
+ }
}
if (reassert)
goto again;
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+
return ret;
}
@@ -1684,7 +1771,8 @@ again:
*
* if possible, TRIM THIS DOWN!!!
*/
-int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_master_list_entry *mle = NULL;
@@ -1693,7 +1781,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
char *name;
unsigned int namelen, hash;
u32 flags;
- int master_request = 0;
+ int master_request = 0, have_lockres_ref = 0;
int ret = 0;
if (!dlm_grab(dlm))
@@ -1760,7 +1848,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
goto done;
- }
+ }
}
}
spin_unlock(&dlm->master_lock);
@@ -1778,12 +1866,12 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
if (!mle) {
if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
res->owner != assert->node_idx) {
- mlog(ML_ERROR, "assert_master from "
- "%u, but current owner is "
- "%u! (%.*s)\n",
- assert->node_idx, res->owner,
- namelen, name);
- goto kill;
+ mlog(ML_ERROR, "DIE! Mastery assert from %u, "
+ "but current owner is %u! (%.*s)\n",
+ assert->node_idx, res->owner, namelen,
+ name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
}
} else if (mle->type != DLM_MLE_MIGRATION) {
if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
@@ -1823,7 +1911,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
ok:
spin_unlock(&res->spinlock);
}
- spin_unlock(&dlm->spinlock);
// mlog(0, "woo! got an assert_master from node %u!\n",
// assert->node_idx);
@@ -1831,7 +1918,7 @@ ok:
int extra_ref = 0;
int nn = -1;
int rr, err = 0;
-
+
spin_lock(&mle->spinlock);
if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
extra_ref = 1;
@@ -1839,10 +1926,12 @@ ok:
/* MASTER mle: if any bits set in the response map
* then the calling node needs to re-assert to clear
* up nodes that this node contacted */
- while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
+ while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
nn+1)) < O2NM_MAX_NODES) {
- if (nn != dlm->node_num && nn != assert->node_idx)
+ if (nn != dlm->node_num && nn != assert->node_idx) {
master_request = 1;
+ break;
+ }
}
}
mle->master = assert->node_idx;
@@ -1851,6 +1940,7 @@ ok:
spin_unlock(&mle->spinlock);
if (res) {
+ int wake = 0;
spin_lock(&res->spinlock);
if (mle->type == DLM_MLE_MIGRATION) {
mlog(0, "finishing off migration of lockres %.*s, "
@@ -1858,18 +1948,21 @@ ok:
res->lockname.len, res->lockname.name,
dlm->node_num, mle->new_master);
res->state &= ~DLM_LOCK_RES_MIGRATING;
+ wake = 1;
dlm_change_lockres_owner(dlm, res, mle->new_master);
BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
} else {
dlm_change_lockres_owner(dlm, res, mle->master);
}
spin_unlock(&res->spinlock);
+ have_lockres_ref = 1;
+ if (wake)
+ wake_up(&res->wq);
}
/* master is known, detach if not already detached.
* ensures that only one assert_master call will happen
* on this mle. */
- spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
rr = atomic_read(&mle->mle_refs.refcount);
@@ -1891,7 +1984,7 @@ ok:
assert->node_idx, rr, extra_ref, mle->inuse);
dlm_print_one_mle(mle);
}
- list_del_init(&mle->list);
+ __dlm_unlink_mle(dlm, mle);
__dlm_mle_detach_hb_events(dlm, mle);
__dlm_put_mle(mle);
if (extra_ref) {
@@ -1902,7 +1995,6 @@ ok:
__dlm_put_mle(mle);
}
spin_unlock(&dlm->master_lock);
- spin_unlock(&dlm->spinlock);
} else if (res) {
if (res->owner != assert->node_idx) {
mlog(0, "assert_master from %u, but current "
@@ -1910,15 +2002,32 @@ ok:
res->owner, namelen, name);
}
}
+ spin_unlock(&dlm->spinlock);
done:
ret = 0;
- if (res)
- dlm_lockres_put(res);
+ if (res) {
+ spin_lock(&res->spinlock);
+ res->state |= DLM_LOCK_RES_SETREF_INPROG;
+ spin_unlock(&res->spinlock);
+ *ret_data = (void *)res;
+ }
dlm_put(dlm);
if (master_request) {
mlog(0, "need to tell master to reassert\n");
- ret = EAGAIN; // positive. negative would shoot down the node.
+ /* positive. negative would shoot down the node. */
+ ret |= DLM_ASSERT_RESPONSE_REASSERT;
+ if (!have_lockres_ref) {
+ mlog(ML_ERROR, "strange, got assert from %u, MASTER "
+ "mle present here for %s:%.*s, but no lockres!\n",
+ assert->node_idx, dlm->name, namelen, name);
+ }
+ }
+ if (have_lockres_ref) {
+ /* let the master know we have a reference to the lockres */
+ ret |= DLM_ASSERT_RESPONSE_MASTERY_REF;
+ mlog(0, "%s:%.*s: got assert from %u, need a ref\n",
+ dlm->name, namelen, name, assert->node_idx);
}
return ret;
@@ -1929,17 +2038,31 @@ kill:
__dlm_print_one_lock_resource(res);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
- dlm_lockres_put(res);
+ *ret_data = (void *)res;
dlm_put(dlm);
return -EINVAL;
}
+void dlm_assert_master_post_handler(int status, void *data, void *ret_data)
+{
+ struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data;
+
+ if (ret_data) {
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ dlm_lockres_put(res);
+ }
+ return;
+}
+
int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
int ignore_higher, u8 request_from, u32 flags)
{
struct dlm_work_item *item;
- item = kcalloc(1, sizeof(*item), GFP_NOFS);
+ item = kzalloc(sizeof(*item), GFP_ATOMIC);
if (!item)
return -ENOMEM;
@@ -1953,10 +2076,10 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
item->u.am.request_from = request_from;
item->u.am.flags = flags;
- if (ignore_higher)
- mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
+ if (ignore_higher)
+ mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
res->lockname.name);
-
+
spin_lock(&dlm->work_lock);
list_add_tail(&item->list, &dlm->work_list);
spin_unlock(&dlm->work_lock);
@@ -2023,9 +2146,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
* even if one or more nodes die */
mlog(0, "worker about to master %.*s here, this=%u\n",
res->lockname.len, res->lockname.name, dlm->node_num);
- ret = dlm_do_assert_master(dlm, res->lockname.name,
- res->lockname.len,
- nodemap, flags);
+ ret = dlm_do_assert_master(dlm, res, nodemap, flags);
if (ret < 0) {
/* no need to restart, we are done */
if (!dlm_is_host_down(ret))
@@ -2036,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
dlm_lockres_release_ast(dlm, res);
put:
+ dlm_lockres_drop_inflight_worker(dlm, res);
+
dlm_lockres_put(res);
mlog(0, "finished with dlm_assert_master_worker\n");
@@ -2048,7 +2171,7 @@ put:
* think that $RECOVERY is currently mastered by a dead node. If so,
* we wait a short time to allow that node to get notified by its own
* heartbeat stack, then check again. All $RECOVERY lock resources
- * mastered by dead nodes are purged when the hearbeat callback is
+ * mastered by dead nodes are purged when the hearbeat callback is
* fired, so we can know for sure that it is safe to continue once
* the node returns a live node or no node. */
static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
@@ -2089,7 +2212,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
ret = -EAGAIN;
}
spin_unlock(&dlm->spinlock);
- mlog(0, "%s: reco lock master is %u\n", dlm->name,
+ mlog(0, "%s: reco lock master is %u\n", dlm->name,
master);
break;
}
@@ -2097,83 +2220,253 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
return ret;
}
-
/*
- * DLM_MIGRATE_LOCKRES
+ * DLM_DEREF_LOCKRES_MSG
*/
+int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+ struct dlm_deref_lockres deref;
+ int ret = 0, r;
+ const char *lockname;
+ unsigned int namelen;
+
+ lockname = res->lockname.name;
+ namelen = res->lockname.len;
+ BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+
+ memset(&deref, 0, sizeof(deref));
+ deref.node_idx = dlm->node_num;
+ deref.namelen = namelen;
+ memcpy(deref.name, lockname, namelen);
+
+ ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
+ &deref, sizeof(deref), res->owner, &r);
+ if (ret < 0)
+ mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
+ dlm->name, namelen, lockname, ret, res->owner);
+ else if (r < 0) {
+ /* BAD. other node says I did not have a ref. */
+ mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+ dlm->name, namelen, lockname, res->owner, r);
+ dlm_print_one_lock_resource(res);
+ BUG();
+ }
+ return ret;
+}
-int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
- u8 target)
+int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
- struct dlm_master_list_entry *mle = NULL;
- struct dlm_master_list_entry *oldmle = NULL;
- struct dlm_migratable_lockres *mres = NULL;
- int ret = -EINVAL;
- const char *name;
+ struct dlm_ctxt *dlm = data;
+ struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf;
+ struct dlm_lock_resource *res = NULL;
+ char *name;
unsigned int namelen;
- int mle_added = 0;
- struct list_head *queue, *iter;
- int i;
- struct dlm_lock *lock;
- int empty = 1;
+ int ret = -EINVAL;
+ u8 node;
+ unsigned int hash;
+ struct dlm_work_item *item;
+ int cleared = 0;
+ int dispatch = 0;
if (!dlm_grab(dlm))
- return -EINVAL;
+ return 0;
- name = res->lockname.name;
- namelen = res->lockname.len;
+ name = deref->name;
+ namelen = deref->namelen;
+ node = deref->node_idx;
- mlog(0, "migrating %.*s to %u\n", namelen, name, target);
+ if (namelen > DLM_LOCKID_NAME_MAX) {
+ mlog(ML_ERROR, "Invalid name length!");
+ goto done;
+ }
+ if (deref->node_idx >= O2NM_MAX_NODES) {
+ mlog(ML_ERROR, "Invalid node number: %u\n", node);
+ goto done;
+ }
+
+ hash = dlm_lockid_hash(name, namelen);
+
+ spin_lock(&dlm->spinlock);
+ res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
+ if (!res) {
+ spin_unlock(&dlm->spinlock);
+ mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
+ dlm->name, namelen, name);
+ goto done;
+ }
+ spin_unlock(&dlm->spinlock);
- /*
- * ensure this lockres is a proper candidate for migration
- */
spin_lock(&res->spinlock);
- if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
- mlog(0, "cannot migrate lockres with unknown owner!\n");
- spin_unlock(&res->spinlock);
- goto leave;
+ if (res->state & DLM_LOCK_RES_SETREF_INPROG)
+ dispatch = 1;
+ else {
+ BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
+ if (test_bit(node, res->refmap)) {
+ dlm_lockres_clear_refmap_bit(dlm, res, node);
+ cleared = 1;
+ }
}
- if (res->owner != dlm->node_num) {
- mlog(0, "cannot migrate lockres this node doesn't own!\n");
- spin_unlock(&res->spinlock);
- goto leave;
+ spin_unlock(&res->spinlock);
+
+ if (!dispatch) {
+ if (cleared)
+ dlm_lockres_calc_usage(dlm, res);
+ else {
+ mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
+ "but it is already dropped!\n", dlm->name,
+ res->lockname.len, res->lockname.name, node);
+ dlm_print_one_lock_resource(res);
+ }
+ ret = 0;
+ goto done;
}
- mlog(0, "checking queues...\n");
- queue = &res->granted;
- for (i=0; i<3; i++) {
- list_for_each(iter, queue) {
- lock = list_entry (iter, struct dlm_lock, list);
- empty = 0;
- if (lock->ml.node == dlm->node_num) {
- mlog(0, "found a lock owned by this node "
- "still on the %s queue! will not "
- "migrate this lockres\n",
- i==0 ? "granted" :
- (i==1 ? "converting" : "blocked"));
- spin_unlock(&res->spinlock);
- ret = -ENOTEMPTY;
- goto leave;
+
+ item = kzalloc(sizeof(*item), GFP_NOFS);
+ if (!item) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto done;
+ }
+
+ dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL);
+ item->u.dl.deref_res = res;
+ item->u.dl.deref_node = node;
+
+ spin_lock(&dlm->work_lock);
+ list_add_tail(&item->list, &dlm->work_list);
+ spin_unlock(&dlm->work_lock);
+
+ queue_work(dlm->dlm_worker, &dlm->dispatched_work);
+ return 0;
+
+done:
+ if (res)
+ dlm_lockres_put(res);
+ dlm_put(dlm);
+
+ return ret;
+}
+
+static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
+{
+ struct dlm_ctxt *dlm;
+ struct dlm_lock_resource *res;
+ u8 node;
+ u8 cleared = 0;
+
+ dlm = item->dlm;
+ res = item->u.dl.deref_res;
+ node = item->u.dl.deref_node;
+
+ spin_lock(&res->spinlock);
+ BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
+ if (test_bit(node, res->refmap)) {
+ __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
+ dlm_lockres_clear_refmap_bit(dlm, res, node);
+ cleared = 1;
+ }
+ spin_unlock(&res->spinlock);
+
+ if (cleared) {
+ mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
+ dlm->name, res->lockname.len, res->lockname.name, node);
+ dlm_lockres_calc_usage(dlm, res);
+ } else {
+ mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
+ "but it is already dropped!\n", dlm->name,
+ res->lockname.len, res->lockname.name, node);
+ dlm_print_one_lock_resource(res);
+ }
+
+ dlm_lockres_put(res);
+}
+
+/*
+ * A migrateable resource is one that is :
+ * 1. locally mastered, and,
+ * 2. zero local locks, and,
+ * 3. one or more non-local locks, or, one or more references
+ * Returns 1 if yes, 0 if not.
+ */
+static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ enum dlm_lockres_list idx;
+ int nonlocal = 0, node_ref;
+ struct list_head *queue;
+ struct dlm_lock *lock;
+ u64 cookie;
+
+ assert_spin_locked(&res->spinlock);
+
+ /* delay migration when the lockres is in MIGRATING state */
+ if (res->state & DLM_LOCK_RES_MIGRATING)
+ return 0;
+
+ if (res->owner != dlm->node_num)
+ return 0;
+
+ for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+ queue = dlm_list_idx_to_ptr(res, idx);
+ list_for_each_entry(lock, queue, list) {
+ if (lock->ml.node != dlm->node_num) {
+ nonlocal++;
+ continue;
}
+ cookie = be64_to_cpu(lock->ml.cookie);
+ mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
+ "%s list\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(cookie),
+ dlm_get_lock_cookie_seq(cookie),
+ dlm_list_in_text(idx));
+ return 0;
}
- queue++;
}
- mlog(0, "all locks on this lockres are nonlocal. continuing\n");
- spin_unlock(&res->spinlock);
- /* no work to do */
- if (empty) {
- mlog(0, "no locks were found on this lockres! done!\n");
- ret = 0;
- goto leave;
+ if (!nonlocal) {
+ node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ if (node_ref >= O2NM_MAX_NODES)
+ return 0;
}
- /*
- * preallocate up front
- * if this fails, abort
- */
+ mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
+ res->lockname.name);
+
+ return 1;
+}
+/*
+ * DLM_MIGRATE_LOCKRES
+ */
+
+
+static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, u8 target)
+{
+ struct dlm_master_list_entry *mle = NULL;
+ struct dlm_master_list_entry *oldmle = NULL;
+ struct dlm_migratable_lockres *mres = NULL;
+ int ret = 0;
+ const char *name;
+ unsigned int namelen;
+ int mle_added = 0;
+ int wake = 0;
+
+ if (!dlm_grab(dlm))
+ return -EINVAL;
+
+ BUG_ON(target == O2NM_MAX_NODES);
+
+ name = res->lockname.name;
+ namelen = res->lockname.len;
+
+ mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
+ target);
+
+ /* preallocate up front. if this fails, abort */
ret = -ENOMEM;
mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
if (!mres) {
@@ -2181,8 +2474,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
goto leave;
}
- mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
- GFP_NOFS);
+ mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
mlog_errno(ret);
goto leave;
@@ -2190,35 +2482,10 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
ret = 0;
/*
- * find a node to migrate the lockres to
- */
-
- mlog(0, "picking a migration node\n");
- spin_lock(&dlm->spinlock);
- /* pick a new node */
- if (!test_bit(target, dlm->domain_map) ||
- target >= O2NM_MAX_NODES) {
- target = dlm_pick_migration_target(dlm, res);
- }
- mlog(0, "node %u chosen for migration\n", target);
-
- if (target >= O2NM_MAX_NODES ||
- !test_bit(target, dlm->domain_map)) {
- /* target chosen is not alive */
- ret = -EINVAL;
- }
-
- if (ret) {
- spin_unlock(&dlm->spinlock);
- goto fail;
- }
-
- mlog(0, "continuing with target = %u\n", target);
-
- /*
* clear any existing master requests and
* add the migration mle to the list
*/
+ spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
@@ -2241,6 +2508,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
res->lockname.name, target);
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_MIGRATING;
+ wake = 1;
spin_unlock(&res->spinlock);
ret = -EINVAL;
}
@@ -2258,6 +2526,7 @@ fail:
dlm_put_mle(mle);
} else if (mle) {
kmem_cache_free(dlm_mle_cache, mle);
+ mle = NULL;
}
goto leave;
}
@@ -2268,6 +2537,9 @@ fail:
* the lockres
*/
+ /* now that remote nodes are spinning on the MIGRATING flag,
+ * ensure that all assert_master work is flushed. */
+ flush_workqueue(dlm->dlm_worker);
/* get an extra reference on the mle.
* otherwise the assert_master from the new
@@ -2296,7 +2568,11 @@ fail:
dlm_put_mle_inuse(mle);
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_MIGRATING;
+ wake = 1;
spin_unlock(&res->spinlock);
+ if (dlm_is_host_down(ret))
+ dlm_wait_for_node_death(dlm, target,
+ DLM_NODE_DEATH_WAIT_MAX);
goto leave;
}
@@ -2308,7 +2584,7 @@ fail:
* is complete everywhere. if the target dies while this is
* going on, some nodes could potentially see the target as the
* master, so it is important that my recovery finds the migration
- * mle and sets the master to UNKNONWN. */
+ * mle and sets the master to UNKNOWN. */
/* wait for new node to assert master */
@@ -2322,28 +2598,29 @@ fail:
res->owner == target)
break;
- mlog(0, "timed out during migration\n");
- /* avoid hang during shutdown when migrating lockres
+ mlog(0, "%s:%.*s: timed out during migration\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ /* avoid hang during shutdown when migrating lockres
* to a node which also goes down */
if (dlm_is_node_dead(dlm, target)) {
mlog(0, "%s:%.*s: expected migration "
"target %u is no longer up, restarting\n",
dlm->name, res->lockname.len,
res->lockname.name, target);
- ret = -ERESTARTSYS;
+ ret = -EINVAL;
+ /* migration failed, detach and clean up mle */
+ dlm_mle_detach_hb_events(dlm, mle);
+ dlm_put_mle(mle);
+ dlm_put_mle_inuse(mle);
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_MIGRATING;
+ wake = 1;
+ spin_unlock(&res->spinlock);
+ goto leave;
}
- }
- if (ret == -ERESTARTSYS) {
- /* migration failed, detach and clean up mle */
- dlm_mle_detach_hb_events(dlm, mle);
- dlm_put_mle(mle);
- dlm_put_mle_inuse(mle);
- spin_lock(&res->spinlock);
- res->state &= ~DLM_LOCK_RES_MIGRATING;
- spin_unlock(&res->spinlock);
- goto leave;
- }
- /* TODO: if node died: stop, clean up, return error */
+ } else
+ mlog(0, "%s:%.*s: caught signal during migration\n",
+ dlm->name, res->lockname.len, res->lockname.name);
}
/* all done, set the owner, clear the flag */
@@ -2366,16 +2643,62 @@ leave:
if (ret < 0)
dlm_kick_thread(dlm, res);
- /* TODO: cleanup */
+ /* wake up waiters if the MIGRATING flag got set
+ * but migration failed */
+ if (wake)
+ wake_up(&res->wq);
+
if (mres)
free_page((unsigned long)mres);
dlm_put(dlm);
- mlog(0, "returning %d\n", ret);
+ mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
+ name, target, ret);
return ret;
}
+#define DLM_MIGRATION_RETRY_MS 100
+
+/*
+ * Should be called only after beginning the domain leave process.
+ * There should not be any remaining locks on nonlocal lock resources,
+ * and there should be no local locks left on locally mastered resources.
+ *
+ * Called with the dlm spinlock held, may drop it to do migration, but
+ * will re-acquire before exit.
+ *
+ * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
+ */
+int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+{
+ int ret;
+ int lock_dropped = 0;
+ u8 target = O2NM_MAX_NODES;
+
+ assert_spin_locked(&dlm->spinlock);
+
+ spin_lock(&res->spinlock);
+ if (dlm_is_lockres_migrateable(dlm, res))
+ target = dlm_pick_migration_target(dlm, res);
+ spin_unlock(&res->spinlock);
+
+ if (target == O2NM_MAX_NODES)
+ goto leave;
+
+ /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
+ spin_unlock(&dlm->spinlock);
+ lock_dropped = 1;
+ ret = dlm_migrate_lockres(dlm, res, target);
+ if (ret)
+ mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ target, ret);
+ spin_lock(&dlm->spinlock);
+leave:
+ return lock_dropped;
+}
+
int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
int ret;
@@ -2396,7 +2719,7 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
spin_unlock(&res->spinlock);
- /* target has died, so make the caller break out of the
+ /* target has died, so make the caller break out of the
* wait_event, but caller must recheck the domain_map */
spin_lock(&dlm->spinlock);
if (!test_bit(mig_target, dlm->domain_map))
@@ -2405,7 +2728,8 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
return can_proceed;
}
-int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
+static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
{
int ret;
spin_lock(&res->spinlock);
@@ -2434,8 +2758,15 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
__dlm_lockres_reserve_ast(res);
spin_unlock(&res->spinlock);
- /* now flush all the pending asts.. hang out for a bit */
+ /* now flush all the pending asts */
dlm_kick_thread(dlm, res);
+ /* before waiting on DIRTY, block processes which may
+ * try to dirty the lockres before MIGRATING is set */
+ spin_lock(&res->spinlock);
+ BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY);
+ res->state |= DLM_LOCK_RES_BLOCK_DIRTY;
+ spin_unlock(&res->spinlock);
+ /* now wait on any pending asts and the DIRTY state */
wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
dlm_lockres_release_ast(dlm, res);
@@ -2462,6 +2793,7 @@ again:
goto again;
}
+ ret = 0;
/* did the target go down or die? */
spin_lock(&dlm->spinlock);
if (!test_bit(target, dlm->domain_map)) {
@@ -2472,9 +2804,21 @@ again:
spin_unlock(&dlm->spinlock);
/*
+ * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
+ * another try; otherwise, we are sure the MIGRATING state is there,
+ * drop the unneded state which blocked threads trying to DIRTY
+ */
+ spin_lock(&res->spinlock);
+ BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
+ res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
+ if (!ret)
+ BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+ spin_unlock(&res->spinlock);
+
+ /*
* at this point:
*
- * o the DLM_LOCK_RES_MIGRATING flag is set
+ * o the DLM_LOCK_RES_MIGRATING flag is set if target not down
* o there are no pending asts on this lockres
* o all processes trying to reserve an ast on this
* lockres must wait for the MIGRATING flag to clear
@@ -2488,18 +2832,16 @@ again:
static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- struct list_head *iter, *iter2;
struct list_head *queue = &res->granted;
- int i;
- struct dlm_lock *lock;
+ int i, bit;
+ struct dlm_lock *lock, *next;
assert_spin_locked(&res->spinlock);
BUG_ON(res->owner == dlm->node_num);
for (i=0; i<3; i++) {
- list_for_each_safe(iter, iter2, queue) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry_safe(lock, next, queue, list) {
if (lock->ml.node != dlm->node_num) {
mlog(0, "putting lock for node %u\n",
lock->ml.node);
@@ -2508,65 +2850,83 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
BUG_ON(!list_empty(&lock->bast_list));
BUG_ON(lock->ast_pending);
BUG_ON(lock->bast_pending);
+ dlm_lockres_clear_refmap_bit(dlm, res,
+ lock->ml.node);
list_del_init(&lock->list);
dlm_lock_put(lock);
+ /* In a normal unlock, we would have added a
+ * DLM_UNLOCK_FREE_LOCK action. Force it. */
+ dlm_lock_put(lock);
}
}
queue++;
}
+ bit = 0;
+ while (1) {
+ bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
+ if (bit >= O2NM_MAX_NODES)
+ break;
+ /* do not clear the local node reference, if there is a
+ * process holding this, let it drop the ref itself */
+ if (bit != dlm->node_num) {
+ mlog(0, "%s:%.*s: node %u had a ref to this "
+ "migrating lockres, clearing\n", dlm->name,
+ res->lockname.len, res->lockname.name, bit);
+ dlm_lockres_clear_refmap_bit(dlm, res, bit);
+ }
+ bit++;
+ }
}
-/* for now this is not too intelligent. we will
- * need stats to make this do the right thing.
- * this just finds the first lock on one of the
- * queues and uses that node as the target. */
+/*
+ * Pick a node to migrate the lock resource to. This function selects a
+ * potential target based first on the locks and then on refmap. It skips
+ * nodes that are in the process of exiting the domain.
+ */
static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- int i;
+ enum dlm_lockres_list idx;
struct list_head *queue = &res->granted;
- struct list_head *iter;
struct dlm_lock *lock;
- int nodenum;
+ int noderef;
+ u8 nodenum = O2NM_MAX_NODES;
assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
- spin_lock(&res->spinlock);
- for (i=0; i<3; i++) {
- list_for_each(iter, queue) {
- /* up to the caller to make sure this node
- * is alive */
- lock = list_entry (iter, struct dlm_lock, list);
- if (lock->ml.node != dlm->node_num) {
- spin_unlock(&res->spinlock);
- return lock->ml.node;
- }
+ /* Go through all the locks */
+ for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+ queue = dlm_list_idx_to_ptr(res, idx);
+ list_for_each_entry(lock, queue, list) {
+ if (lock->ml.node == dlm->node_num)
+ continue;
+ if (test_bit(lock->ml.node, dlm->exit_domain_map))
+ continue;
+ nodenum = lock->ml.node;
+ goto bail;
}
- queue++;
}
- spin_unlock(&res->spinlock);
- mlog(0, "have not found a suitable target yet! checking domain map\n");
- /* ok now we're getting desperate. pick anyone alive. */
- nodenum = -1;
+ /* Go thru the refmap */
+ noderef = -1;
while (1) {
- nodenum = find_next_bit(dlm->domain_map,
- O2NM_MAX_NODES, nodenum+1);
- mlog(0, "found %d in domain map\n", nodenum);
- if (nodenum >= O2NM_MAX_NODES)
+ noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
+ noderef + 1);
+ if (noderef >= O2NM_MAX_NODES)
break;
- if (nodenum != dlm->node_num) {
- mlog(0, "picking %d\n", nodenum);
- return nodenum;
- }
+ if (noderef == dlm->node_num)
+ continue;
+ if (test_bit(noderef, dlm->exit_domain_map))
+ continue;
+ nodenum = noderef;
+ goto bail;
}
- mlog(0, "giving up. no master to migrate to\n");
- return DLM_LOCK_RES_OWNER_UNKNOWN;
+bail:
+ return nodenum;
}
-
-
/* this is called by the new master once all lockres
* data has been received */
static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
@@ -2575,7 +2935,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
struct dlm_node_iter *iter)
{
struct dlm_migrate_request migrate;
- int ret, status = 0;
+ int ret, skip, status = 0;
int nodenum;
memset(&migrate, 0, sizeof(migrate));
@@ -2592,15 +2952,42 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
nodenum == new_master)
continue;
+ /* We could race exit domain. If exited, skip. */
+ spin_lock(&dlm->spinlock);
+ skip = (!test_bit(nodenum, dlm->domain_map));
+ spin_unlock(&dlm->spinlock);
+ if (skip) {
+ clear_bit(nodenum, iter->node_map);
+ continue;
+ }
+
ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
&migrate, sizeof(migrate), nodenum,
&status);
- if (ret < 0)
- mlog_errno(ret);
- else if (status < 0) {
+ if (ret < 0) {
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send "
+ "MIGRATE_REQUEST to node %u\n", dlm->name,
+ migrate.namelen, migrate.name, ret, nodenum);
+ if (!dlm_is_host_down(ret)) {
+ mlog(ML_ERROR, "unhandled error=%d!\n", ret);
+ BUG();
+ }
+ clear_bit(nodenum, iter->node_map);
+ ret = 0;
+ } else if (status < 0) {
mlog(0, "migrate request (node %u) returned %d!\n",
nodenum, status);
ret = status;
+ } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) {
+ /* during the migration request we short-circuited
+ * the mastery of the lockres. make sure we have
+ * a mastery ref for nodenum */
+ mlog(0, "%s:%.*s: need ref for node %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ nodenum);
+ spin_lock(&res->spinlock);
+ dlm_lockres_set_refmap_bit(dlm, res, nodenum);
+ spin_unlock(&res->spinlock);
}
}
@@ -2619,7 +3006,8 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
* we will have no mle in the list to start with. now we can add an mle for
* the migration and this should be the only one found for those scanning the
* list. */
-int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_lock_resource *res = NULL;
@@ -2637,8 +3025,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
hash = dlm_lockid_hash(name, namelen);
/* preallocate.. if this fails, abort */
- mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
- GFP_NOFS);
+ mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
ret = -ENOMEM;
@@ -2648,8 +3035,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
/* check for pre-existing lock */
spin_lock(&dlm->spinlock);
res = __dlm_lookup_lockres(dlm, name, namelen, hash);
- spin_lock(&dlm->master_lock);
-
if (res) {
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -2667,14 +3052,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
spin_unlock(&res->spinlock);
}
+ spin_lock(&dlm->master_lock);
/* ignore status. only nonzero status would BUG. */
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
name, namelen,
migrate->new_master,
migrate->master);
-unlock:
spin_unlock(&dlm->master_lock);
+unlock:
spin_unlock(&dlm->spinlock);
if (oldmle) {
@@ -2709,8 +3095,6 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
*oldmle = NULL;
- mlog_entry_void();
-
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&dlm->master_lock);
@@ -2742,10 +3126,18 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
tmp->master = master;
atomic_set(&tmp->woken, 1);
wake_up(&tmp->wq);
- /* remove it from the list so that only one
- * mle will be found */
- list_del_init(&tmp->list);
- __dlm_mle_detach_hb_events(dlm, mle);
+ /* remove it so that only one mle will be found */
+ __dlm_unlink_mle(dlm, tmp);
+ __dlm_mle_detach_hb_events(dlm, tmp);
+ if (tmp->type == DLM_MLE_MASTER) {
+ ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
+ mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
+ "telling master to get ref "
+ "for cleared out mle during "
+ "migration\n", dlm->name,
+ namelen, name, master,
+ new_master);
+ }
}
spin_unlock(&tmp->spinlock);
}
@@ -2753,143 +3145,166 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
/* now add a migration mle to the tail of the list */
dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
mle->new_master = new_master;
+ /* the new master will be sending an assert master for this.
+ * at that point we will get the refmap reference */
mle->master = master;
/* do this for consistency with other mle types */
set_bit(new_master, mle->maybe_map);
- list_add(&mle->list, &dlm->master_list);
+ __dlm_insert_mle(dlm, mle);
return ret;
}
-
-void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+/*
+ * Sets the owner of the lockres, associated to the mle, to UNKNOWN
+ */
+static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
+ struct dlm_master_list_entry *mle)
{
- struct list_head *iter, *iter2;
- struct dlm_master_list_entry *mle;
struct dlm_lock_resource *res;
- unsigned int hash;
- mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
-top:
- assert_spin_locked(&dlm->spinlock);
+ /* Find the lockres associated to the mle and set its owner to UNK */
+ res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
+ mle->mnamehash);
+ if (res) {
+ spin_unlock(&dlm->master_lock);
- /* clean the master list */
- spin_lock(&dlm->master_lock);
- list_for_each_safe(iter, iter2, &dlm->master_list) {
- mle = list_entry(iter, struct dlm_master_list_entry, list);
+ /* move lockres onto recovery list */
+ spin_lock(&res->spinlock);
+ dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
+ dlm_move_lockres_to_recovery_list(dlm, res);
+ spin_unlock(&res->spinlock);
+ dlm_lockres_put(res);
- BUG_ON(mle->type != DLM_MLE_BLOCK &&
- mle->type != DLM_MLE_MASTER &&
- mle->type != DLM_MLE_MIGRATION);
+ /* about to get rid of mle, detach from heartbeat */
+ __dlm_mle_detach_hb_events(dlm, mle);
- /* MASTER mles are initiated locally. the waiting
- * process will notice the node map change
- * shortly. let that happen as normal. */
- if (mle->type == DLM_MLE_MASTER)
- continue;
+ /* dump the mle */
+ spin_lock(&dlm->master_lock);
+ __dlm_put_mle(mle);
+ spin_unlock(&dlm->master_lock);
+ }
+ return res;
+}
- /* BLOCK mles are initiated by other nodes.
- * need to clean up if the dead node would have
- * been the master. */
- if (mle->type == DLM_MLE_BLOCK) {
- int bit;
+static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
+ struct dlm_master_list_entry *mle)
+{
+ __dlm_mle_detach_hb_events(dlm, mle);
- spin_lock(&mle->spinlock);
- bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
- if (bit != dead_node) {
- mlog(0, "mle found, but dead node %u would "
- "not have been master\n", dead_node);
- spin_unlock(&mle->spinlock);
- } else {
- /* must drop the refcount by one since the
- * assert_master will never arrive. this
- * may result in the mle being unlinked and
- * freed, but there may still be a process
- * waiting in the dlmlock path which is fine. */
- mlog(0, "node %u was expected master\n",
- dead_node);
- atomic_set(&mle->woken, 1);
- spin_unlock(&mle->spinlock);
- wake_up(&mle->wq);
- /* do not need events any longer, so detach
- * from heartbeat */
- __dlm_mle_detach_hb_events(dlm, mle);
- __dlm_put_mle(mle);
- }
- continue;
- }
+ spin_lock(&mle->spinlock);
+ __dlm_unlink_mle(dlm, mle);
+ atomic_set(&mle->woken, 1);
+ spin_unlock(&mle->spinlock);
- /* everything else is a MIGRATION mle */
-
- /* the rule for MIGRATION mles is that the master
- * becomes UNKNOWN if *either* the original or
- * the new master dies. all UNKNOWN lockreses
- * are sent to whichever node becomes the recovery
- * master. the new master is responsible for
- * determining if there is still a master for
- * this lockres, or if he needs to take over
- * mastery. either way, this node should expect
- * another message to resolve this. */
- if (mle->master != dead_node &&
- mle->new_master != dead_node)
- continue;
+ wake_up(&mle->wq);
+}
- /* if we have reached this point, this mle needs to
- * be removed from the list and freed. */
+static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
+ struct dlm_master_list_entry *mle, u8 dead_node)
+{
+ int bit;
- /* remove from the list early. NOTE: unlinking
- * list_head while in list_for_each_safe */
- __dlm_mle_detach_hb_events(dlm, mle);
- spin_lock(&mle->spinlock);
- list_del_init(&mle->list);
+ BUG_ON(mle->type != DLM_MLE_BLOCK);
+
+ spin_lock(&mle->spinlock);
+ bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+ if (bit != dead_node) {
+ mlog(0, "mle found, but dead node %u would not have been "
+ "master\n", dead_node);
+ spin_unlock(&mle->spinlock);
+ } else {
+ /* Must drop the refcount by one since the assert_master will
+ * never arrive. This may result in the mle being unlinked and
+ * freed, but there may still be a process waiting in the
+ * dlmlock path which is fine. */
+ mlog(0, "node %u was expected master\n", dead_node);
atomic_set(&mle->woken, 1);
spin_unlock(&mle->spinlock);
wake_up(&mle->wq);
- mlog(0, "%s: node %u died during migration from "
- "%u to %u!\n", dlm->name, dead_node,
- mle->master, mle->new_master);
- /* if there is a lockres associated with this
- * mle, find it and set its owner to UNKNOWN */
- hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
- res = __dlm_lookup_lockres(dlm, mle->u.name.name,
- mle->u.name.len, hash);
- if (res) {
- /* unfortunately if we hit this rare case, our
- * lock ordering is messed. we need to drop
- * the master lock so that we can take the
- * lockres lock, meaning that we will have to
- * restart from the head of list. */
- spin_unlock(&dlm->master_lock);
+ /* Do not need events any longer, so detach from heartbeat */
+ __dlm_mle_detach_hb_events(dlm, mle);
+ __dlm_put_mle(mle);
+ }
+}
- /* move lockres onto recovery list */
- spin_lock(&res->spinlock);
- dlm_set_lockres_owner(dlm, res,
- DLM_LOCK_RES_OWNER_UNKNOWN);
- dlm_move_lockres_to_recovery_list(dlm, res);
- spin_unlock(&res->spinlock);
- dlm_lockres_put(res);
+void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+{
+ struct dlm_master_list_entry *mle;
+ struct dlm_lock_resource *res;
+ struct hlist_head *bucket;
+ struct hlist_node *tmp;
+ unsigned int i;
- /* about to get rid of mle, detach from heartbeat */
- __dlm_mle_detach_hb_events(dlm, mle);
+ mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
+top:
+ assert_spin_locked(&dlm->spinlock);
- /* dump the mle */
- spin_lock(&dlm->master_lock);
- __dlm_put_mle(mle);
- spin_unlock(&dlm->master_lock);
+ /* clean the master list */
+ spin_lock(&dlm->master_lock);
+ for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+ bucket = dlm_master_hash(dlm, i);
+ hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
+ BUG_ON(mle->type != DLM_MLE_BLOCK &&
+ mle->type != DLM_MLE_MASTER &&
+ mle->type != DLM_MLE_MIGRATION);
+
+ /* MASTER mles are initiated locally. The waiting
+ * process will notice the node map change shortly.
+ * Let that happen as normal. */
+ if (mle->type == DLM_MLE_MASTER)
+ continue;
+
+ /* BLOCK mles are initiated by other nodes. Need to
+ * clean up if the dead node would have been the
+ * master. */
+ if (mle->type == DLM_MLE_BLOCK) {
+ dlm_clean_block_mle(dlm, mle, dead_node);
+ continue;
+ }
- /* restart */
- goto top;
- }
+ /* Everything else is a MIGRATION mle */
+
+ /* The rule for MIGRATION mles is that the master
+ * becomes UNKNOWN if *either* the original or the new
+ * master dies. All UNKNOWN lockres' are sent to
+ * whichever node becomes the recovery master. The new
+ * master is responsible for determining if there is
+ * still a master for this lockres, or if he needs to
+ * take over mastery. Either way, this node should
+ * expect another message to resolve this. */
+
+ if (mle->master != dead_node &&
+ mle->new_master != dead_node)
+ continue;
+
+ /* If we have reached this point, this mle needs to be
+ * removed from the list and freed. */
+ dlm_clean_migration_mle(dlm, mle);
+
+ mlog(0, "%s: node %u died during migration from "
+ "%u to %u!\n", dlm->name, dead_node, mle->master,
+ mle->new_master);
+
+ /* If we find a lockres associated with the mle, we've
+ * hit this rare case that messes up our lock ordering.
+ * If so, we need to drop the master lock so that we can
+ * take the lockres lock, meaning that we will have to
+ * restart from the head of list. */
+ res = dlm_reset_mleres_owner(dlm, mle);
+ if (res)
+ /* restart */
+ goto top;
- /* this may be the last reference */
- __dlm_put_mle(mle);
+ /* This may be the last reference */
+ __dlm_put_mle(mle);
+ }
}
spin_unlock(&dlm->master_lock);
}
-
int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
u8 old_master)
{
@@ -2902,6 +3317,13 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
clear_bit(dlm->node_num, iter.node_map);
spin_unlock(&dlm->spinlock);
+ /* ownership of the lockres is changing. account for the
+ * mastery reference here since old_master will briefly have
+ * a reference after the migration completes */
+ spin_lock(&res->spinlock);
+ dlm_lockres_set_refmap_bit(dlm, res, old_master);
+ spin_unlock(&res->spinlock);
+
mlog(0, "now time to do a migrate request to other nodes\n");
ret = dlm_do_migrate_request(dlm, res, old_master,
dlm->node_num, &iter);
@@ -2914,8 +3336,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
res->lockname.len, res->lockname.name);
/* this call now finishes out the nodemap
* even if one or more nodes die */
- ret = dlm_do_assert_master(dlm, res->lockname.name,
- res->lockname.len, iter.node_map,
+ ret = dlm_do_assert_master(dlm, res, iter.node_map,
DLM_ASSERT_MASTER_FINISH_MIGRATION);
if (ret < 0) {
/* no longer need to retry. all living nodes contacted. */
@@ -2927,8 +3348,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
set_bit(old_master, iter.node_map);
mlog(0, "doing assert master of %.*s back to %u\n",
res->lockname.len, res->lockname.name, old_master);
- ret = dlm_do_assert_master(dlm, res->lockname.name,
- res->lockname.len, iter.node_map,
+ ret = dlm_do_assert_master(dlm, res, iter.node_map,
DLM_ASSERT_MASTER_FINISH_MIGRATION);
if (ret < 0) {
mlog(0, "assert master to original master failed "
@@ -3001,3 +3421,41 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
wake_up(&res->wq);
wake_up(&dlm->migration_wq);
}
+
+void dlm_force_free_mles(struct dlm_ctxt *dlm)
+{
+ int i;
+ struct hlist_head *bucket;
+ struct dlm_master_list_entry *mle;
+ struct hlist_node *tmp;
+
+ /*
+ * We notified all other nodes that we are exiting the domain and
+ * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
+ * around we force free them and wake any processes that are waiting
+ * on the mles
+ */
+ spin_lock(&dlm->spinlock);
+ spin_lock(&dlm->master_lock);
+
+ BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
+ BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
+
+ for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+ bucket = dlm_master_hash(dlm, i);
+ hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
+ if (mle->type != DLM_MLE_BLOCK) {
+ mlog(ML_ERROR, "bad mle: %p\n", mle);
+ dlm_print_one_mle(mle);
+ }
+ atomic_set(&mle->woken, 1);
+ wake_up(&mle->wq);
+
+ __dlm_unlink_mle(dlm, mle);
+ __dlm_mle_detach_hb_events(dlm, mle);
+ __dlm_put_mle(mle);
+ }
+ }
+ spin_unlock(&dlm->master_lock);
+ spin_unlock(&dlm->spinlock);
+}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9d950d7cea3..45067faf569 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -56,9 +55,6 @@
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_recovery_thread(void *data);
-void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
-int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
-void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
static int dlm_do_recovery(struct dlm_ctxt *dlm);
static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
@@ -153,29 +149,25 @@ static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
}
/* Worker function used during recovery. */
-void dlm_dispatch_work(void *data)
+void dlm_dispatch_work(struct work_struct *work)
{
- struct dlm_ctxt *dlm = (struct dlm_ctxt *)data;
+ struct dlm_ctxt *dlm =
+ container_of(work, struct dlm_ctxt, dispatched_work);
LIST_HEAD(tmp_list);
- struct list_head *iter, *iter2;
- struct dlm_work_item *item;
+ struct dlm_work_item *item, *next;
dlm_workfunc_t *workfunc;
int tot=0;
- if (!dlm_joined(dlm))
- return;
-
spin_lock(&dlm->work_lock);
list_splice_init(&dlm->work_list, &tmp_list);
spin_unlock(&dlm->work_lock);
- list_for_each_safe(iter, iter2, &tmp_list) {
+ list_for_each_entry(item, &tmp_list, list) {
tot++;
}
mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
- list_for_each_safe(iter, iter2, &tmp_list) {
- item = list_entry(iter, struct dlm_work_item, list);
+ list_for_each_entry_safe(item, next, &tmp_list, list) {
workfunc = item->func;
list_del_init(&item->list);
@@ -263,7 +255,7 @@ static void dlm_print_reco_node_status(struct dlm_ctxt *dlm)
struct dlm_lock_resource *res;
mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n",
- dlm->name, dlm->dlm_reco_thread_task->pid,
+ dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive",
dlm->reco.dead_node, dlm->reco.new_master);
@@ -315,7 +307,7 @@ static int dlm_recovery_thread(void *data)
mlog(0, "dlm thread running for %s...\n", dlm->name);
while (!kthread_should_stop()) {
- if (dlm_joined(dlm)) {
+ if (dlm_domain_fully_joined(dlm)) {
status = dlm_do_recovery(dlm);
if (status == -EAGAIN) {
/* do not sleep, recheck immediately. */
@@ -367,40 +359,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
}
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
{
- if (timeout) {
- mlog(ML_NOTICE, "%s: waiting %dms for notification of "
- "death of node %u\n", dlm->name, timeout, node);
+ if (dlm_is_node_dead(dlm, node))
+ return;
+
+ printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
+ "domain %s\n", node, dlm->name);
+
+ if (timeout)
wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_dead(dlm, node),
- msecs_to_jiffies(timeout));
- } else {
- mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
- "of death of node %u\n", dlm->name, node);
+ dlm_is_node_dead(dlm, node),
+ msecs_to_jiffies(timeout));
+ else
wait_event(dlm->dlm_reco_thread_wq,
dlm_is_node_dead(dlm, node));
- }
- /* for now, return 0 */
- return 0;
}
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
{
- if (timeout) {
- mlog(0, "%s: waiting %dms for notification of "
- "recovery of node %u\n", dlm->name, timeout, node);
+ if (dlm_is_node_recovered(dlm, node))
+ return;
+
+ printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
+ "domain %s\n", node, dlm->name);
+
+ if (timeout)
wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_recovered(dlm, node),
- msecs_to_jiffies(timeout));
- } else {
- mlog(0, "%s: waiting indefinitely for notification "
- "of recovery of node %u\n", dlm->name, node);
+ dlm_is_node_recovered(dlm, node),
+ msecs_to_jiffies(timeout));
+ else
wait_event(dlm->dlm_reco_thread_wq,
dlm_is_node_recovered(dlm, node));
- }
- /* for now, return 0 */
- return 0;
}
/* callers of the top-level api calls (dlmlock/dlmunlock) should
@@ -424,7 +414,7 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
if (dlm_in_recovery(dlm)) {
mlog(0, "%s: reco thread %d in recovery: "
"state=%d, master=%u, dead=%u\n",
- dlm->name, dlm->dlm_reco_thread_task->pid,
+ dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
dlm->reco.state, dlm->reco.new_master,
dlm->reco.dead_node);
}
@@ -435,6 +425,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm)
{
spin_lock(&dlm->spinlock);
BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+ printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
+ dlm->name, dlm->reco.dead_node);
dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
spin_unlock(&dlm->spinlock);
}
@@ -445,9 +437,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
spin_unlock(&dlm->spinlock);
+ printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
wake_up(&dlm->reco.event);
}
+static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
+{
+ printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
+ "dead node %u in domain %s\n", dlm->reco.new_master,
+ (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
+ dlm->reco.dead_node, dlm->name);
+}
+
static int dlm_do_recovery(struct dlm_ctxt *dlm)
{
int status = 0;
@@ -468,7 +469,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
int bit;
- bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
+ bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit >= O2NM_MAX_NODES || bit < 0)
dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
else
@@ -487,7 +488,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
return 0;
}
mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
- dlm->name, dlm->dlm_reco_thread_task->pid,
+ dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
dlm->reco.dead_node);
spin_unlock(&dlm->spinlock);
@@ -510,9 +511,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
}
mlog(0, "another node will master this recovery session.\n");
}
- mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
- dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master,
- dlm->node_num, dlm->reco.dead_node);
+
+ dlm_print_recovery_master(dlm);
/* it is safe to start everything back up here
* because all of the dead node's lock resources
@@ -523,15 +523,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
return 0;
master_here:
- mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n",
- dlm->dlm_reco_thread_task->pid,
- dlm->name, dlm->reco.dead_node, dlm->node_num);
+ dlm_print_recovery_master(dlm);
status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
if (status < 0) {
/* we should never hit this anymore */
- mlog(ML_ERROR, "error %d remastering locks for node %u, "
- "retrying.\n", status, dlm->reco.dead_node);
+ mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
+ "retrying.\n", dlm->name, status, dlm->reco.dead_node);
/* yield a bit to allow any final network messages
* to get handled on remaining nodes */
msleep(100);
@@ -539,7 +537,10 @@ master_here:
/* success! see if any other nodes need recovery */
mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
dlm->name, dlm->reco.dead_node, dlm->node_num);
- dlm_reset_recovery(dlm);
+ spin_lock(&dlm->spinlock);
+ __dlm_reset_recovery(dlm);
+ dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
}
dlm_end_recovery(dlm);
@@ -551,7 +552,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
{
int status = 0;
struct dlm_reco_node_data *ndata;
- struct list_head *iter;
int all_nodes_done;
int destroy = 0;
int pass = 0;
@@ -569,12 +569,11 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
/* safe to access the node data list without a lock, since this
* process is the only one to change the list */
- list_for_each(iter, &dlm->reco.node_data) {
- ndata = list_entry (iter, struct dlm_reco_node_data, list);
+ list_for_each_entry(ndata, &dlm->reco.node_data, list) {
BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
- mlog(0, "requesting lock info from node %u\n",
+ mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
ndata->node_num);
if (ndata->node_num == dlm->node_num) {
@@ -613,6 +612,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
}
} while (status != 0);
+ spin_lock(&dlm_reco_state_lock);
switch (ndata->state) {
case DLM_RECO_NODE_DATA_INIT:
case DLM_RECO_NODE_DATA_FINALIZE_SENT:
@@ -643,9 +643,10 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
ndata->node_num, dead_node);
break;
}
+ spin_unlock(&dlm_reco_state_lock);
}
- mlog(0, "done requesting all lock info\n");
+ mlog(0, "%s: Done requesting all lock info\n", dlm->name);
/* nodes should be sending reco data now
* just need to wait */
@@ -655,9 +656,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
* done, or if anyone died */
all_nodes_done = 1;
spin_lock(&dlm_reco_state_lock);
- list_for_each(iter, &dlm->reco.node_data) {
- ndata = list_entry (iter, struct dlm_reco_node_data, list);
-
+ list_for_each_entry(ndata, &dlm->reco.node_data, list) {
mlog(0, "checking recovery state of node %u\n",
ndata->node_num);
switch (ndata->state) {
@@ -699,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
if (all_nodes_done) {
int ret;
+ /* Set this flag on recovery master to avoid
+ * a new recovery for another dead node start
+ * before the recovery is not done. That may
+ * cause recovery hung.*/
+ spin_lock(&dlm->spinlock);
+ dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
+
/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
* just send a finalize message to everyone and
* clean up */
@@ -734,7 +741,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
if (destroy)
dlm_destroy_recovery_area(dlm, dead_node);
- mlog_exit(status);
return status;
}
@@ -756,7 +762,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
}
BUG_ON(num == dead_node);
- ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS);
+ ndata = kzalloc(sizeof(*ndata), GFP_NOFS);
if (!ndata) {
dlm_destroy_recovery_area(dlm, dead_node);
return -ENOMEM;
@@ -774,16 +780,14 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
{
- struct list_head *iter, *iter2;
- struct dlm_reco_node_data *ndata;
+ struct dlm_reco_node_data *ndata, *next;
LIST_HEAD(tmplist);
spin_lock(&dlm_reco_state_lock);
list_splice_init(&dlm->reco.node_data, &tmplist);
spin_unlock(&dlm_reco_state_lock);
- list_for_each_safe(iter, iter2, &tmplist) {
- ndata = list_entry (iter, struct dlm_reco_node_data, list);
+ list_for_each_entry_safe(ndata, next, &tmplist, list) {
list_del_init(&ndata->list);
kfree(ndata);
}
@@ -793,7 +797,8 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
u8 dead_node)
{
struct dlm_lock_request lr;
- enum dlm_status ret;
+ int ret;
+ int status;
mlog(0, "\n");
@@ -806,21 +811,24 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
lr.dead_node = dead_node;
// send message
- ret = DLM_NOLOCKMGR;
ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
- &lr, sizeof(lr), request_from, NULL);
+ &lr, sizeof(lr), request_from, &status);
/* negative status is handled by caller */
if (ret < 0)
- mlog_errno(ret);
-
+ mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
+ "to recover dead node %u\n", dlm->name, ret,
+ request_from, dead_node);
+ else
+ ret = status;
// return from here, then
// sleep until all received or error
return ret;
}
-int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf;
@@ -841,7 +849,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
}
BUG_ON(lr->dead_node != dlm->reco.dead_node);
- item = kcalloc(1, sizeof(*item), GFP_NOFS);
+ item = kzalloc(sizeof(*item), GFP_NOFS);
if (!item) {
dlm_put(dlm);
return -ENOMEM;
@@ -875,7 +883,6 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
struct dlm_lock_resource *res;
struct dlm_ctxt *dlm;
LIST_HEAD(resources);
- struct list_head *iter;
int ret;
u8 dead_node, reco_master;
int skip_all_done = 0;
@@ -919,8 +926,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
/* any errors returned will be due to the new_master dying,
* the dlm_reco_thread should detect this */
- list_for_each(iter, &resources) {
- res = list_entry (iter, struct dlm_lock_resource, recovering);
+ list_for_each_entry(res, &resources, recovering) {
ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
DLM_MRES_RECOVERY);
if (ret < 0) {
@@ -965,10 +971,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
sizeof(done_msg), send_to, &tmpret);
if (ret < 0) {
+ mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
+ "to recover dead node %u\n", dlm->name, ret, send_to,
+ dead_node);
if (!dlm_is_host_down(ret)) {
- mlog_errno(ret);
- mlog(ML_ERROR, "%s: unknown error sending data-done "
- "to %u\n", dlm->name, send_to);
BUG();
}
} else
@@ -977,11 +983,11 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
}
-int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
- struct list_head *iter;
struct dlm_reco_node_data *ndata = NULL;
int ret = -EINVAL;
@@ -998,8 +1004,7 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
dlm->reco.dead_node, done->node_idx, dlm->node_num);
spin_lock(&dlm_reco_state_lock);
- list_for_each(iter, &dlm->reco.node_data) {
- ndata = list_entry (iter, struct dlm_reco_node_data, list);
+ list_for_each_entry(ndata, &dlm->reco.node_data, list) {
if (ndata->node_num != done->node_idx)
continue;
@@ -1047,13 +1052,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
struct list_head *list,
u8 dead_node)
{
- struct dlm_lock_resource *res;
- struct list_head *iter, *iter2;
+ struct dlm_lock_resource *res, *next;
struct dlm_lock *lock;
spin_lock(&dlm->spinlock);
- list_for_each_safe(iter, iter2, &dlm->reco.resources) {
- res = list_entry (iter, struct dlm_lock_resource, recovering);
+ list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
/* always prune any $RECOVERY entries for dead nodes,
* otherwise hangs can occur during later recovery */
if (dlm_is_recovery_lock(res->lockname.name,
@@ -1063,7 +1066,7 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
if (lock->ml.node == dead_node) {
mlog(0, "AHA! there was "
"a $RECOVERY lock for dead "
- "node %u (%s)!\n",
+ "node %u (%s)!\n",
dead_node, dlm->name);
list_del_init(&lock->list);
dlm_lock_put(lock);
@@ -1128,13 +1131,22 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
if (total_locks == mres_total_locks)
mres->flags |= DLM_MRES_ALL_DONE;
+ mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery",
+ send_to);
+
/* send it */
ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
sz, send_to, &status);
if (ret < 0) {
/* XXX: negative status is not handled.
* this will end up killing this node. */
- mlog_errno(ret);
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
+ "node %u (%s)\n", dlm->name, mres->lockname_len,
+ mres->lockname, ret, send_to,
+ (orig_flags & DLM_MRES_MIGRATION ?
+ "migration" : "recovery"));
} else {
/* might get an -ENOMEM back here */
ret = status;
@@ -1162,7 +1174,7 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
u8 flags, u8 master)
{
/* mres here is one full page */
- memset(mres, 0, PAGE_SIZE);
+ clear_page(mres);
mres->lockname_len = namelen;
memcpy(mres->lockname, lockname, namelen);
mres->num_locks = 0;
@@ -1172,6 +1184,39 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
mres->master = master;
}
+static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock,
+ struct dlm_migratable_lockres *mres,
+ int queue)
+{
+ if (!lock->lksb)
+ return;
+
+ /* Ignore lvb in all locks in the blocked list */
+ if (queue == DLM_BLOCKED_LIST)
+ return;
+
+ /* Only consider lvbs in locks with granted EX or PR lock levels */
+ if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE)
+ return;
+
+ if (dlm_lvb_is_empty(mres->lvb)) {
+ memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+ return;
+ }
+
+ /* Ensure the lvb copied for migration matches in other valid locks */
+ if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))
+ return;
+
+ mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, "
+ "node=%u\n",
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->lockres->lockname.len, lock->lockres->lockname.name,
+ lock->ml.node);
+ dlm_print_one_lock_resource(lock->lockres);
+ BUG();
+}
/* returns 1 if this lock fills the network structure,
* 0 otherwise */
@@ -1189,20 +1234,7 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
ml->list = queue;
if (lock->lksb) {
ml->flags = lock->lksb->flags;
- /* send our current lvb */
- if (ml->type == LKM_EXMODE ||
- ml->type == LKM_PRMODE) {
- /* if it is already set, this had better be a PR
- * and it has to match */
- if (!dlm_lvb_is_empty(mres->lvb) &&
- (ml->type == LKM_EXMODE ||
- memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
- mlog(ML_ERROR, "mismatched lvbs!\n");
- __dlm_print_one_lock_resource(lock->lockres);
- BUG();
- }
- memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
- }
+ dlm_prepare_lvb_for_migration(lock, mres, queue);
}
ml->node = lock->ml.node;
mres->num_locks++;
@@ -1212,12 +1244,40 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
return 0;
}
+static void dlm_add_dummy_lock(struct dlm_ctxt *dlm,
+ struct dlm_migratable_lockres *mres)
+{
+ struct dlm_lock dummy;
+ memset(&dummy, 0, sizeof(dummy));
+ dummy.ml.cookie = 0;
+ dummy.ml.type = LKM_IVMODE;
+ dummy.ml.convert_type = LKM_IVMODE;
+ dummy.ml.highest_blocked = LKM_IVMODE;
+ dummy.lksb = NULL;
+ dummy.ml.node = dlm->node_num;
+ dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST);
+}
+
+static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm,
+ struct dlm_migratable_lock *ml,
+ u8 *nodenum)
+{
+ if (unlikely(ml->cookie == 0 &&
+ ml->type == LKM_IVMODE &&
+ ml->convert_type == LKM_IVMODE &&
+ ml->highest_blocked == LKM_IVMODE &&
+ ml->list == DLM_BLOCKED_LIST)) {
+ *nodenum = ml->node;
+ return 1;
+ }
+ return 0;
+}
int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_migratable_lockres *mres,
u8 send_to, u8 flags)
{
- struct list_head *queue, *iter;
+ struct list_head *queue;
int total_locks, i;
u64 mig_cookie = 0;
struct dlm_lock *lock;
@@ -1243,9 +1303,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
total_locks = 0;
for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
queue = dlm_list_idx_to_ptr(res, i);
- list_for_each(iter, queue) {
- lock = list_entry (iter, struct dlm_lock, list);
-
+ list_for_each_entry(lock, queue, list) {
/* add another lock. */
total_locks++;
if (!dlm_add_lock_to_array(lock, mres, i))
@@ -1259,6 +1317,14 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
goto error;
}
}
+ if (total_locks == 0) {
+ /* send a dummy lock to indicate a mastery reference only */
+ mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ send_to, flags & DLM_MRES_RECOVERY ? "recovery" :
+ "migration");
+ dlm_add_dummy_lock(dlm, mres);
+ }
/* flush any remaining locks */
ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
if (ret < 0)
@@ -1292,13 +1358,15 @@ error:
* do we spin? returning an error only delays the problem really
*/
-int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_migratable_lockres *mres =
(struct dlm_migratable_lockres *)msg->buf;
int ret = 0;
u8 real_master;
+ u8 extra_refs = 0;
char *buf = NULL;
struct dlm_work_item *item = NULL;
struct dlm_lock_resource *res = NULL;
@@ -1322,7 +1390,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
ret = -ENOMEM;
buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS);
- item = kcalloc(1, sizeof(*item), GFP_NOFS);
+ item = kzalloc(sizeof(*item), GFP_NOFS);
if (!buf || !item)
goto leave;
@@ -1350,6 +1418,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
mres->lockname_len, mres->lockname);
ret = -EFAULT;
spin_unlock(&res->spinlock);
+ dlm_lockres_put(res);
goto leave;
}
res->state |= DLM_LOCK_RES_MIGRATING;
@@ -1376,22 +1445,38 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
__dlm_insert_lockres(dlm, res);
spin_unlock(&dlm->spinlock);
+ /* Add an extra ref for this lock-less lockres lest the
+ * dlm_thread purges it before we get the chance to add
+ * locks to it */
+ dlm_lockres_get(res);
+
+ /* There are three refs that need to be put.
+ * 1. Taken above.
+ * 2. kref_init in dlm_new_lockres()->dlm_init_lockres().
+ * 3. dlm_lookup_lockres()
+ * The first one is handled at the end of this function. The
+ * other two are handled in the worker thread after locks have
+ * been attached. Yes, we don't wait for purge time to match
+ * kref_init. The lockres will still have atleast one ref
+ * added because it is in the hash __dlm_insert_lockres() */
+ extra_refs++;
+
/* now that the new lockres is inserted,
* make it usable by other processes */
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
spin_unlock(&res->spinlock);
-
- /* add an extra ref for just-allocated lockres
- * otherwise the lockres will be purged immediately */
- dlm_lockres_get(res);
-
+ wake_up(&res->wq);
}
/* at this point we have allocated everything we need,
* and we have a hashed lockres with an extra ref and
* the proper res->state flags. */
ret = 0;
+ spin_lock(&res->spinlock);
+ /* drop this either when master requery finds a different master
+ * or when a lock is added by the recovery worker */
+ dlm_lockres_grab_inflight_ref(dlm, res);
if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
/* migration cannot have an unknown master */
BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
@@ -1399,10 +1484,11 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
"unknown owner.. will need to requery: "
"%.*s\n", mres->lockname_len, mres->lockname);
} else {
- spin_lock(&res->spinlock);
+ /* take a reference now to pin the lockres, drop it
+ * when locks are added in the worker */
dlm_change_lockres_owner(dlm, res, dlm->node_num);
- spin_unlock(&res->spinlock);
}
+ spin_unlock(&res->spinlock);
/* queue up work for dlm_mig_lockres_worker */
dlm_grab(dlm); /* get an extra ref for the work item */
@@ -1410,38 +1496,43 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf);
item->u.ml.lockres = res; /* already have a ref */
item->u.ml.real_master = real_master;
+ item->u.ml.extra_ref = extra_refs;
spin_lock(&dlm->work_lock);
list_add_tail(&item->list, &dlm->work_list);
spin_unlock(&dlm->work_lock);
queue_work(dlm->dlm_worker, &dlm->dispatched_work);
leave:
+ /* One extra ref taken needs to be put here */
+ if (extra_refs)
+ dlm_lockres_put(res);
+
dlm_put(dlm);
if (ret < 0) {
- if (buf)
- kfree(buf);
- if (item)
- kfree(item);
+ kfree(buf);
+ kfree(item);
+ mlog_errno(ret);
}
- mlog_exit(ret);
return ret;
}
static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
{
- struct dlm_ctxt *dlm = data;
+ struct dlm_ctxt *dlm;
struct dlm_migratable_lockres *mres;
int ret = 0;
struct dlm_lock_resource *res;
u8 real_master;
+ u8 extra_ref;
dlm = item->dlm;
mres = (struct dlm_migratable_lockres *)data;
res = item->u.ml.lockres;
real_master = item->u.ml.real_master;
+ extra_ref = item->u.ml.extra_ref;
if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
/* this case is super-rare. only occurs if
@@ -1458,6 +1549,9 @@ again:
"this node will take it.\n",
res->lockname.len, res->lockname.name);
} else {
+ spin_lock(&res->spinlock);
+ dlm_lockres_drop_inflight_ref(dlm, res);
+ spin_unlock(&res->spinlock);
mlog(0, "master needs to respond to sender "
"that node %u still owns %.*s\n",
real_master, res->lockname.len,
@@ -1481,8 +1575,13 @@ again:
}
leave:
+ /* See comment in dlm_mig_lockres_handler() */
+ if (res) {
+ if (extra_ref)
+ dlm_lockres_put(res);
+ dlm_lockres_put(res);
+ }
kfree(data);
- mlog_exit(ret);
}
@@ -1561,7 +1660,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
&req, sizeof(req), nodenum, &status);
/* XXX: negative status not handled properly here. */
if (ret < 0)
- mlog_errno(ret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
+ dlm->key, nodenum);
else {
BUG_ON(status < 0);
BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -1577,7 +1678,8 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
/* this function cannot error, so unless the sending
* or receiving of the message failed, the owner can
* be trusted */
-int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
@@ -1606,8 +1708,10 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
mlog_errno(-ENOMEM);
/* retry!? */
BUG();
- }
- }
+ } else
+ __dlm_lockres_grab_inflight_worker(dlm, res);
+ } else /* put.. incase we are not the master */
+ dlm_lockres_put(res);
spin_unlock(&res->spinlock);
}
spin_unlock(&dlm->spinlock);
@@ -1658,22 +1762,39 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
struct dlm_migratable_lockres *mres)
{
struct dlm_migratable_lock *ml;
- struct list_head *queue;
+ struct list_head *queue, *iter;
+ struct list_head *tmpq = NULL;
struct dlm_lock *newlock = NULL;
struct dlm_lockstatus *lksb = NULL;
int ret = 0;
- int i, bad;
- struct list_head *iter;
- struct dlm_lock *lock = NULL;
+ int i, j, bad;
+ struct dlm_lock *lock;
+ u8 from = O2NM_MAX_NODES;
+ unsigned int added = 0;
+ __be64 c;
mlog(0, "running %d locks for this lockres\n", mres->num_locks);
for (i=0; i<mres->num_locks; i++) {
ml = &(mres->ml[i]);
+
+ if (dlm_is_dummy_lock(dlm, ml, &from)) {
+ /* placeholder, just need to set the refmap bit */
+ BUG_ON(mres->num_locks != 1);
+ mlog(0, "%s:%.*s: dummy lock for %u\n",
+ dlm->name, mres->lockname_len, mres->lockname,
+ from);
+ spin_lock(&res->spinlock);
+ dlm_lockres_set_refmap_bit(dlm, res, from);
+ spin_unlock(&res->spinlock);
+ added++;
+ break;
+ }
BUG_ON(ml->highest_blocked != LKM_IVMODE);
newlock = NULL;
lksb = NULL;
queue = dlm_list_num_to_pointer(res, ml->list);
+ tmpq = NULL;
/* if the lock is for the local node it needs to
* be moved to the proper location within the queue.
@@ -1682,26 +1803,69 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
/* MIGRATION ONLY! */
BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
+ lock = NULL;
spin_lock(&res->spinlock);
- list_for_each(iter, queue) {
- lock = list_entry (iter, struct dlm_lock, list);
- if (lock->ml.cookie != ml->cookie)
+ for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
+ tmpq = dlm_list_idx_to_ptr(res, j);
+ list_for_each(iter, tmpq) {
+ lock = list_entry(iter,
+ struct dlm_lock, list);
+ if (lock->ml.cookie == ml->cookie)
+ break;
lock = NULL;
- else
+ }
+ if (lock)
break;
}
/* lock is always created locally first, and
* destroyed locally last. it must be on the list */
if (!lock) {
- u64 c = ml->cookie;
- mlog(ML_ERROR, "could not find local lock "
- "with cookie %u:%llu!\n",
- dlm_get_lock_cookie_node(c),
- dlm_get_lock_cookie_seq(c));
+ c = ml->cookie;
+ mlog(ML_ERROR, "Could not find local lock "
+ "with cookie %u:%llu, node %u, "
+ "list %u, flags 0x%x, type %d, "
+ "conv %d, highest blocked %d\n",
+ dlm_get_lock_cookie_node(be64_to_cpu(c)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+ ml->node, ml->list, ml->flags, ml->type,
+ ml->convert_type, ml->highest_blocked);
+ __dlm_print_one_lock_resource(res);
BUG();
}
- BUG_ON(lock->ml.node != ml->node);
+
+ if (lock->ml.node != ml->node) {
+ c = lock->ml.cookie;
+ mlog(ML_ERROR, "Mismatched node# in lock "
+ "cookie %u:%llu, name %.*s, node %u\n",
+ dlm_get_lock_cookie_node(be64_to_cpu(c)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+ res->lockname.len, res->lockname.name,
+ lock->ml.node);
+ c = ml->cookie;
+ mlog(ML_ERROR, "Migrate lock cookie %u:%llu, "
+ "node %u, list %u, flags 0x%x, type %d, "
+ "conv %d, highest blocked %d\n",
+ dlm_get_lock_cookie_node(be64_to_cpu(c)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+ ml->node, ml->list, ml->flags, ml->type,
+ ml->convert_type, ml->highest_blocked);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
+
+ if (tmpq != queue) {
+ c = ml->cookie;
+ mlog(0, "Lock cookie %u:%llu was on list %u "
+ "instead of list %u for %.*s\n",
+ dlm_get_lock_cookie_node(be64_to_cpu(c)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+ j, ml->list, res->lockname.len,
+ res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ spin_unlock(&res->spinlock);
+ continue;
+ }
/* see NOTE above about why we do not update
* to match the master here */
@@ -1710,6 +1874,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, queue);
spin_unlock(&res->spinlock);
+ added++;
mlog(0, "just reordered a local lock!\n");
continue;
@@ -1735,6 +1900,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
if (ml->type == LKM_NLMODE)
goto skip_lvb;
+ /*
+ * If the lock is in the blocked list it can't have a valid lvb,
+ * so skip it
+ */
+ if (ml->list == DLM_BLOCKED_LIST)
+ goto skip_lvb;
+
if (!dlm_lvb_is_empty(mres->lvb)) {
if (lksb->flags & DLM_LKSB_PUT_LVB) {
/* other node was trying to update
@@ -1747,7 +1919,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
* the lvb. */
memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
} else {
- /* otherwise, the node is sending its
+ /* otherwise, the node is sending its
* most recent valid lvb info */
BUG_ON(ml->type != LKM_EXMODE &&
ml->type != LKM_PRMODE);
@@ -1794,18 +1966,18 @@ skip_lvb:
spin_lock(&res->spinlock);
list_for_each_entry(lock, queue, list) {
if (lock->ml.cookie == ml->cookie) {
- u64 c = lock->ml.cookie;
+ c = lock->ml.cookie;
mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
"exists on this lockres!\n", dlm->name,
res->lockname.len, res->lockname.name,
- dlm_get_lock_cookie_node(c),
- dlm_get_lock_cookie_seq(c));
+ dlm_get_lock_cookie_node(be64_to_cpu(c)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(c)));
mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, "
"node=%u, cookie=%u:%llu, queue=%d\n",
ml->type, ml->convert_type, ml->node,
- dlm_get_lock_cookie_node(ml->cookie),
- dlm_get_lock_cookie_seq(ml->cookie),
+ dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)),
ml->list);
__dlm_print_one_lock_resource(res);
@@ -1815,20 +1987,37 @@ skip_lvb:
}
if (!bad) {
dlm_lock_get(newlock);
- list_add_tail(&newlock->list, queue);
+ if (mres->flags & DLM_MRES_RECOVERY &&
+ ml->list == DLM_CONVERTING_LIST &&
+ newlock->ml.type >
+ newlock->ml.convert_type) {
+ /* newlock is doing downconvert, add it to the
+ * head of converting list */
+ list_add(&newlock->list, queue);
+ } else
+ list_add_tail(&newlock->list, queue);
+ mlog(0, "%s:%.*s: added lock for node %u, "
+ "setting refmap bit\n", dlm->name,
+ res->lockname.len, res->lockname.name, ml->node);
+ dlm_lockres_set_refmap_bit(dlm, res, ml->node);
+ added++;
}
spin_unlock(&res->spinlock);
}
mlog(0, "done running all the locks\n");
leave:
+ /* balance the ref taken when the work was queued */
+ spin_lock(&res->spinlock);
+ dlm_lockres_drop_inflight_ref(dlm, res);
+ spin_unlock(&res->spinlock);
+
if (ret < 0) {
mlog_errno(ret);
if (newlock)
dlm_lock_put(newlock);
}
- mlog_exit(ret);
return ret;
}
@@ -1836,15 +2025,18 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
int i;
- struct list_head *queue, *iter, *iter2;
- struct dlm_lock *lock;
+ struct list_head *queue;
+ struct dlm_lock *lock, *next;
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
res->state |= DLM_LOCK_RES_RECOVERING;
if (!list_empty(&res->recovering)) {
mlog(0,
"Recovering res %s:%.*s, is already on recovery list!\n",
dlm->name, res->lockname.len, res->lockname.name);
list_del_init(&res->recovering);
+ dlm_lockres_put(res);
}
/* We need to hold a reference while on the recovery list */
dlm_lockres_get(res);
@@ -1853,8 +2045,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
/* find any pending locks and put them back on proper list */
for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
queue = dlm_list_idx_to_ptr(res, i);
- list_for_each_safe(iter, iter2, queue) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry_safe(lock, next, queue, list) {
dlm_lock_get(lock);
if (lock->convert_pending) {
/* move converting lock back to granted */
@@ -1919,24 +2110,23 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
u8 dead_node, u8 new_master)
{
int i;
- struct list_head *iter, *iter2;
- struct hlist_node *hash_iter;
struct hlist_head *bucket;
-
- struct dlm_lock_resource *res;
-
- mlog_entry_void();
+ struct dlm_lock_resource *res, *next;
assert_spin_locked(&dlm->spinlock);
- list_for_each_safe(iter, iter2, &dlm->reco.resources) {
- res = list_entry (iter, struct dlm_lock_resource, recovering);
+ list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
if (res->owner == dead_node) {
+ mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->owner, new_master);
list_del_init(&res->recovering);
spin_lock(&res->spinlock);
+ /* new_master has our reference from
+ * the lock state sent during recovery */
dlm_change_lockres_owner(dlm, res, new_master);
res->state &= ~DLM_LOCK_RES_RECOVERING;
- if (!__dlm_lockres_unused(res))
+ if (__dlm_lockres_has_locks(res))
__dlm_dirty_lockres(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
@@ -1950,39 +2140,31 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
* if necessary */
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
- if (res->state & DLM_LOCK_RES_RECOVERING) {
- if (res->owner == dead_node) {
- mlog(0, "(this=%u) res %.*s owner=%u "
- "was not on recovering list, but "
- "clearing state anyway\n",
- dlm->node_num, res->lockname.len,
- res->lockname.name, new_master);
- } else if (res->owner == dlm->node_num) {
- mlog(0, "(this=%u) res %.*s owner=%u "
- "was not on recovering list, "
- "owner is THIS node, clearing\n",
- dlm->node_num, res->lockname.len,
- res->lockname.name, new_master);
- } else
- continue;
+ hlist_for_each_entry(res, bucket, hash_node) {
+ if (!(res->state & DLM_LOCK_RES_RECOVERING))
+ continue;
- if (!list_empty(&res->recovering)) {
- mlog(0, "%s:%.*s: lockres was "
- "marked RECOVERING, owner=%u\n",
- dlm->name, res->lockname.len,
- res->lockname.name, res->owner);
- list_del_init(&res->recovering);
- dlm_lockres_put(res);
- }
- spin_lock(&res->spinlock);
- dlm_change_lockres_owner(dlm, res, new_master);
- res->state &= ~DLM_LOCK_RES_RECOVERING;
- if (!__dlm_lockres_unused(res))
- __dlm_dirty_lockres(dlm, res);
- spin_unlock(&res->spinlock);
- wake_up(&res->wq);
+ if (res->owner != dead_node &&
+ res->owner != dlm->node_num)
+ continue;
+
+ if (!list_empty(&res->recovering)) {
+ list_del_init(&res->recovering);
+ dlm_lockres_put(res);
}
+
+ /* new_master has our reference from
+ * the lock state sent during recovery */
+ mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->owner, new_master);
+ spin_lock(&res->spinlock);
+ dlm_change_lockres_owner(dlm, res, new_master);
+ res->state &= ~DLM_LOCK_RES_RECOVERING;
+ if (__dlm_lockres_has_locks(res))
+ __dlm_dirty_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
}
}
}
@@ -2001,7 +2183,7 @@ static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res, u8 dead_node)
{
- struct list_head *iter, *queue;
+ struct list_head *queue;
struct dlm_lock *lock;
int blank_lvb = 0, local = 0;
int i;
@@ -2011,7 +2193,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
assert_spin_locked(&res->spinlock);
if (res->owner == dlm->node_num)
- /* if this node owned the lockres, and if the dead node
+ /* if this node owned the lockres, and if the dead node
* had an EX when he died, blank out the lvb */
search_node = dead_node;
else {
@@ -2023,8 +2205,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
queue = dlm_list_idx_to_ptr(res, i);
- list_for_each(iter, queue) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry(lock, queue, list) {
if (lock->ml.node == search_node) {
if (dlm_lvb_needs_invalidation(lock, local)) {
/* zero the lksb lvb and lockres lvb */
@@ -2045,39 +2226,66 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res, u8 dead_node)
{
- struct list_head *iter, *tmpiter;
- struct dlm_lock *lock;
+ struct dlm_lock *lock, *next;
+ unsigned int freed = 0;
/* this node is the lockres master:
* 1) remove any stale locks for the dead node
- * 2) if the dead node had an EX when he died, blank out the lvb
+ * 2) if the dead node had an EX when he died, blank out the lvb
*/
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
+ /* We do two dlm_lock_put(). One for removing from list and the other is
+ * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */
+
/* TODO: check pending_asts, pending_basts here */
- list_for_each_safe(iter, tmpiter, &res->granted) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry_safe(lock, next, &res->granted, list) {
if (lock->ml.node == dead_node) {
list_del_init(&lock->list);
dlm_lock_put(lock);
+ /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
+ dlm_lock_put(lock);
+ freed++;
}
}
- list_for_each_safe(iter, tmpiter, &res->converting) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry_safe(lock, next, &res->converting, list) {
if (lock->ml.node == dead_node) {
list_del_init(&lock->list);
dlm_lock_put(lock);
+ /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
+ dlm_lock_put(lock);
+ freed++;
}
}
- list_for_each_safe(iter, tmpiter, &res->blocked) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry_safe(lock, next, &res->blocked, list) {
if (lock->ml.node == dead_node) {
list_del_init(&lock->list);
dlm_lock_put(lock);
+ /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
+ dlm_lock_put(lock);
+ freed++;
}
}
+ if (freed) {
+ mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
+ "dropping ref from lockres\n", dlm->name,
+ res->lockname.len, res->lockname.name, freed, dead_node);
+ if(!test_bit(dead_node, res->refmap)) {
+ mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, "
+ "but ref was not set\n", dlm->name,
+ res->lockname.len, res->lockname.name, freed, dead_node);
+ __dlm_print_one_lock_resource(res);
+ }
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
+ } else if (test_bit(dead_node, res->refmap)) {
+ mlog(0, "%s:%.*s: dead node %u had a ref, but had "
+ "no locks and had not purged before dying\n", dlm->name,
+ res->lockname.len, res->lockname.name, dead_node);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
+ }
+
/* do not kick thread yet */
__dlm_dirty_lockres(dlm, res);
}
@@ -2091,7 +2299,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
{
- struct hlist_node *iter;
struct dlm_lock_resource *res;
int i;
struct hlist_head *bucket;
@@ -2117,7 +2324,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
*/
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, iter, bucket, hash_node) {
+ hlist_for_each_entry(res, bucket, hash_node) {
/* always prune any $RECOVERY entries for dead nodes,
* otherwise hangs can occur during later recovery */
if (dlm_is_recovery_lock(res->lockname.name,
@@ -2136,15 +2343,31 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
}
spin_unlock(&res->spinlock);
continue;
- }
+ }
spin_lock(&res->spinlock);
/* zero the lvb if necessary */
dlm_revalidate_lvb(dlm, res, dead_node);
- if (res->owner == dead_node)
- dlm_move_lockres_to_recovery_list(dlm, res);
- else if (res->owner == dlm->node_num) {
+ if (res->owner == dead_node) {
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(ML_NOTICE, "%s: res %.*s, Skip "
+ "recovery as it is being freed\n",
+ dlm->name, res->lockname.len,
+ res->lockname.name);
+ } else
+ dlm_move_lockres_to_recovery_list(dlm,
+ res);
+
+ } else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
+ } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+ if (test_bit(dead_node, res->refmap)) {
+ mlog(0, "%s:%.*s: dead node %u had a ref, but had "
+ "no locks and had not purged before dying\n",
+ dlm->name, res->lockname.len,
+ res->lockname.name, dead_node);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
+ }
}
spin_unlock(&res->spinlock);
}
@@ -2170,6 +2393,12 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
}
}
+ /* Clean up join state on node death. */
+ if (dlm->joining_node == idx) {
+ mlog(0, "Clearing join state for node %u\n", idx);
+ __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
+ }
+
/* check to see if the node is already considered dead */
if (!test_bit(idx, dlm->live_nodes_map)) {
mlog(0, "for domain %s, node %d is already dead. "
@@ -2188,12 +2417,6 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
clear_bit(idx, dlm->live_nodes_map);
- /* Clean up join state on node death. */
- if (dlm->joining_node == idx) {
- mlog(0, "Clearing join state for node %u\n", idx);
- __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- }
-
/* make sure local cleanup occurs before the heartbeat events */
if (!test_bit(idx, dlm->recovery_map))
dlm_do_local_recovery_cleanup(dlm, idx);
@@ -2203,6 +2426,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
mlog(0, "node %u being removed from domain map!\n", idx);
clear_bit(idx, dlm->domain_map);
+ clear_bit(idx, dlm->exit_domain_map);
/* wake up migration waiters if a node goes down.
* perhaps later we can genericize this for other waiters. */
wake_up(&dlm->migration_wq);
@@ -2221,6 +2445,13 @@ void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
if (!dlm_grab(dlm))
return;
+ /*
+ * This will notify any dlm users that a node in our domain
+ * went away without notifying us first.
+ */
+ if (test_bit(idx, dlm->domain_map))
+ dlm_fire_domain_eviction_callbacks(dlm, idx);
+
spin_lock(&dlm->spinlock);
__dlm_hb_node_down(dlm, idx);
spin_unlock(&dlm->spinlock);
@@ -2268,7 +2499,7 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
* this function on each node racing to become the recovery
* master will not stop attempting this until either:
* a) this node gets the EX (and becomes the recovery master),
- * or b) dlm->reco.new_master gets set to some nodenum
+ * or b) dlm->reco.new_master gets set to some nodenum
* != O2NM_INVALID_NODE_NUM (another node will do the reco).
* so each time a recovery master is needed, the entire cluster
* will sync at this point. if the new master dies, that will
@@ -2281,7 +2512,7 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
-again:
+again:
memset(&lksb, 0, sizeof(lksb));
ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
@@ -2294,8 +2525,8 @@ again:
if (ret == DLM_NORMAL) {
mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
dlm->name, dlm->node_num);
-
- /* got the EX lock. check to see if another node
+
+ /* got the EX lock. check to see if another node
* just became the reco master */
if (dlm_reco_master_ready(dlm)) {
mlog(0, "%s: got reco EX lock, but %u will "
@@ -2308,12 +2539,12 @@ again:
/* see if recovery was already finished elsewhere */
spin_lock(&dlm->spinlock);
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
- status = -EINVAL;
+ status = -EINVAL;
mlog(0, "%s: got reco EX lock, but "
"node got recovered already\n", dlm->name);
if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
mlog(ML_ERROR, "%s: new master is %u "
- "but no dead node!\n",
+ "but no dead node!\n",
dlm->name, dlm->reco.new_master);
BUG();
}
@@ -2325,7 +2556,7 @@ again:
* set the master and send the messages to begin recovery */
if (!status) {
mlog(0, "%s: dead=%u, this=%u, sending "
- "begin_reco now\n", dlm->name,
+ "begin_reco now\n", dlm->name,
dlm->reco.dead_node, dlm->node_num);
status = dlm_send_begin_reco_message(dlm,
dlm->reco.dead_node);
@@ -2358,7 +2589,7 @@ again:
mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
dlm->name, dlm->node_num);
/* another node is master. wait on
- * reco.new_master != O2NM_INVALID_NODE_NUM
+ * reco.new_master != O2NM_INVALID_NODE_NUM
* for at most one second */
wait_event_timeout(dlm->dlm_reco_thread_wq,
dlm_reco_master_ready(dlm),
@@ -2405,8 +2636,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
int nodenum;
int status;
- mlog_entry("%u\n", dead_node);
-
mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
spin_lock(&dlm->spinlock);
@@ -2442,17 +2671,32 @@ retry:
if (dlm_is_host_down(ret)) {
/* node is down. not involved in recovery
* so just keep going */
- mlog(0, "%s: node %u was down when sending "
+ mlog(ML_NOTICE, "%s: node %u was down when sending "
"begin reco msg (%d)\n", dlm->name, nodenum, ret);
ret = 0;
}
+
+ /*
+ * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8,
+ * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN.
+ * We are handling both for compatibility reasons.
+ */
+ if (ret == -EAGAIN || ret == EAGAIN) {
+ mlog(0, "%s: trying to start recovery of node "
+ "%u, but node %u is waiting for last recovery "
+ "to complete, backoff for a bit\n", dlm->name,
+ dead_node, nodenum);
+ msleep(100);
+ goto retry;
+ }
if (ret < 0) {
struct dlm_lock_resource *res;
- /* this is now a serious problem, possibly ENOMEM
+
+ /* this is now a serious problem, possibly ENOMEM
* in the network stack. must retry */
mlog_errno(ret);
mlog(ML_ERROR, "begin reco of dlm %s to node %u "
- " returned %d\n", dlm->name, nodenum, ret);
+ "returned %d\n", dlm->name, nodenum, ret);
res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
DLM_RECOVERY_LOCK_NAME_LEN);
if (res) {
@@ -2461,25 +2705,18 @@ retry:
} else {
mlog(ML_ERROR, "recovery lock not found\n");
}
- /* sleep for a bit in hopes that we can avoid
+ /* sleep for a bit in hopes that we can avoid
* another ENOMEM */
msleep(100);
goto retry;
- } else if (ret == EAGAIN) {
- mlog(0, "%s: trying to start recovery of node "
- "%u, but node %u is waiting for last recovery "
- "to complete, backoff for a bit\n", dlm->name,
- dead_node, nodenum);
- /* TODO Look into replacing msleep with cond_resched() */
- msleep(100);
- goto retry;
}
}
return ret;
}
-int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf;
@@ -2495,7 +2732,8 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
dlm->name, br->node_idx, br->dead_node,
dlm->reco.dead_node, dlm->reco.new_master);
spin_unlock(&dlm->spinlock);
- return EAGAIN;
+ dlm_put(dlm);
+ return -EAGAIN;
}
spin_unlock(&dlm->spinlock);
@@ -2520,7 +2758,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
}
if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
- "node %u changing it to %u\n", dlm->name,
+ "node %u changing it to %u\n", dlm->name,
dlm->reco.dead_node, br->node_idx, br->dead_node);
}
dlm_set_reco_master(dlm, br->node_idx);
@@ -2584,10 +2822,12 @@ stage2:
if (ret >= 0)
ret = status;
if (ret < 0) {
- mlog_errno(ret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
+ dlm->key, nodenum);
if (dlm_is_host_down(ret)) {
- /* this has no effect on this recovery
- * session, so set the status to zero to
+ /* this has no effect on this recovery
+ * session, so set the status to zero to
* finish out the last recovery */
mlog(ML_ERROR, "node %u went down after this "
"node finished recovery.\n", nodenum);
@@ -2607,7 +2847,8 @@ stage2:
return ret;
}
-int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
@@ -2623,7 +2864,7 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
mlog(0, "%s: node %u finalizing recovery stage%d of "
"node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
-
+
spin_lock(&dlm->spinlock);
if (dlm->reco.new_master != fr->node_idx) {
@@ -2663,8 +2904,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
BUG();
}
dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ __dlm_reset_recovery(dlm);
spin_unlock(&dlm->spinlock);
- dlm_reset_recovery(dlm);
dlm_kick_recovery_thread(dlm);
break;
default:
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 0c822f3ffb0..69aac6f088a 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,9 +28,7 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -54,9 +52,6 @@
#include "cluster/masklog.h"
static int dlm_thread(void *data);
-static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *lockres);
-
static void dlm_flush_asts(struct dlm_ctxt *dlm);
#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num)
@@ -79,18 +74,47 @@ repeat:
goto repeat;
}
remove_wait_queue(&res->wq, &wait);
- current->state = TASK_RUNNING;
+ __set_current_state(TASK_RUNNING);
}
-
-int __dlm_lockres_unused(struct dlm_lock_resource *res)
+int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
{
if (list_empty(&res->granted) &&
list_empty(&res->converting) &&
- list_empty(&res->blocked) &&
- list_empty(&res->dirty))
- return 1;
- return 0;
+ list_empty(&res->blocked))
+ return 0;
+ return 1;
+}
+
+/* "unused": the lockres has no locks, is not on the dirty list,
+ * has no inflight locks (in the gap between mastery and acquiring
+ * the first lock), and has no bits in its refmap.
+ * truly ready to be freed. */
+int __dlm_lockres_unused(struct dlm_lock_resource *res)
+{
+ int bit;
+
+ assert_spin_locked(&res->spinlock);
+
+ if (__dlm_lockres_has_locks(res))
+ return 0;
+
+ /* Locks are in the process of being created */
+ if (res->inflight_locks)
+ return 0;
+
+ if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
+ return 0;
+
+ if (res->state & DLM_LOCK_RES_RECOVERING)
+ return 0;
+
+ /* Another node has this resource with this node as the master */
+ bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ if (bit < O2NM_MAX_NODES)
+ return 0;
+
+ return 1;
}
@@ -100,52 +124,25 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
-
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
if (__dlm_lockres_unused(res)){
- /* For now, just keep any resource we master */
- if (res->owner == dlm->node_num)
- {
- if (!list_empty(&res->purge)) {
- mlog(0, "we master %s:%.*s, but it is on "
- "the purge list. Removing\n",
- dlm->name, res->lockname.len,
- res->lockname.name);
- list_del_init(&res->purge);
- dlm->purge_count--;
- }
- return;
- }
-
if (list_empty(&res->purge)) {
- mlog(0, "putting lockres %.*s from purge list\n",
- res->lockname.len, res->lockname.name);
+ mlog(0, "%s: Adding res %.*s to purge list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
res->last_used = jiffies;
+ dlm_lockres_get(res);
list_add_tail(&res->purge, &dlm->purge_list);
dlm->purge_count++;
-
- /* if this node is not the owner, there is
- * no way to keep track of who the owner could be.
- * unhash it to avoid serious problems. */
- if (res->owner != dlm->node_num) {
- mlog(0, "%s:%.*s: doing immediate "
- "purge of lockres owned by %u\n",
- dlm->name, res->lockname.len,
- res->lockname.name, res->owner);
-
- dlm_purge_lockres_now(dlm, res);
- }
}
} else if (!list_empty(&res->purge)) {
- mlog(0, "removing lockres %.*s from purge list, "
- "owner=%u\n", res->lockname.len, res->lockname.name,
- res->owner);
+ mlog(0, "%s: Removing res %.*s from purge list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
list_del_init(&res->purge);
+ dlm_lockres_put(res);
dlm->purge_count--;
}
}
@@ -153,7 +150,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
@@ -163,68 +159,66 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
spin_unlock(&dlm->spinlock);
}
-/* TODO: Eventual API: Called with the dlm spinlock held, may drop it
- * to do migration, but will re-acquire before exit. */
-void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres)
+static void dlm_purge_lockres(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
{
int master;
- int ret;
+ int ret = 0;
- spin_lock(&lockres->spinlock);
- master = lockres->owner == dlm->node_num;
- spin_unlock(&lockres->spinlock);
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
- mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len,
- lockres->lockname.name, master);
+ master = (res->owner == dlm->node_num);
- /* Non master is the easy case -- no migration required, just
- * quit. */
- if (!master)
- goto finish;
+ mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
+ res->lockname.len, res->lockname.name, master);
- /* Wheee! Migrate lockres here! */
- spin_unlock(&dlm->spinlock);
-again:
+ if (!master) {
+ res->state |= DLM_LOCK_RES_DROPPING_REF;
+ /* drop spinlock... retake below */
+ spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
- ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES);
- if (ret == -ENOTEMPTY) {
- mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
- lockres->lockname.len, lockres->lockname.name);
+ spin_lock(&res->spinlock);
+ /* This ensures that clear refmap is sent after the set */
+ __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
+ spin_unlock(&res->spinlock);
- BUG();
- } else if (ret < 0) {
- mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
- lockres->lockname.len, lockres->lockname.name);
- msleep(100);
- goto again;
+ /* clear our bit from the master's refmap, ignore errors */
+ ret = dlm_drop_lockres_ref(dlm, res);
+ if (ret < 0) {
+ if (!dlm_is_host_down(ret))
+ BUG();
+ }
+ spin_lock(&dlm->spinlock);
+ spin_lock(&res->spinlock);
}
- spin_lock(&dlm->spinlock);
-
-finish:
- if (!list_empty(&lockres->purge)) {
- list_del_init(&lockres->purge);
+ if (!list_empty(&res->purge)) {
+ mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
+ dlm->name, res->lockname.len, res->lockname.name, master);
+ list_del_init(&res->purge);
+ dlm_lockres_put(res);
dlm->purge_count--;
}
- __dlm_unhash_lockres(lockres);
-}
-/* make an unused lockres go away immediately.
- * as soon as the dlm spinlock is dropped, this lockres
- * will not be found. kfree still happens on last put. */
-static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *lockres)
-{
- assert_spin_locked(&dlm->spinlock);
- assert_spin_locked(&lockres->spinlock);
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
- BUG_ON(!__dlm_lockres_unused(lockres));
+ __dlm_unhash_lockres(dlm, res);
- if (!list_empty(&lockres->purge)) {
- list_del_init(&lockres->purge);
- dlm->purge_count--;
- }
- __dlm_unhash_lockres(lockres);
+ /* lockres is not in the hash now. drop the flag and wake up
+ * any processes waiting in dlm_get_lock_resource. */
+ if (!master) {
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ } else
+ spin_unlock(&res->spinlock);
}
static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -243,17 +237,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
lockres = list_entry(dlm->purge_list.next,
struct dlm_lock_resource, purge);
- /* Status of the lockres *might* change so double
- * check. If the lockres is unused, holding the dlm
- * spinlock will prevent people from getting and more
- * refs on it -- there's no need to keep the lockres
- * spinlock. */
spin_lock(&lockres->spinlock);
- unused = __dlm_lockres_unused(lockres);
- spin_unlock(&lockres->spinlock);
-
- if (!unused)
- continue;
purge_jiffies = lockres->last_used +
msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -265,17 +249,34 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
* in tail order, we can stop at the first
* unpurgable resource -- anyone added after
* him will have a greater last_used value */
+ spin_unlock(&lockres->spinlock);
break;
}
- list_del_init(&lockres->purge);
- dlm->purge_count--;
+ /* Status of the lockres *might* change so double
+ * check. If the lockres is unused, holding the dlm
+ * spinlock will prevent people from getting and more
+ * refs on it. */
+ unused = __dlm_lockres_unused(lockres);
+ if (!unused ||
+ (lockres->state & DLM_LOCK_RES_MIGRATING) ||
+ (lockres->inflight_assert_workers != 0)) {
+ mlog(0, "%s: res %.*s is in use or being remastered, "
+ "used %d, state %d, assert master workers %u\n",
+ dlm->name, lockres->lockname.len,
+ lockres->lockname.name,
+ !unused, lockres->state,
+ lockres->inflight_assert_workers);
+ list_move_tail(&lockres->purge, &dlm->purge_list);
+ spin_unlock(&lockres->spinlock);
+ continue;
+ }
+
+ dlm_lockres_get(lockres);
- /* This may drop and reacquire the dlm spinlock if it
- * has to do migration. */
- mlog(0, "calling dlm_purge_lockres!\n");
dlm_purge_lockres(dlm, lockres);
- mlog(0, "DONE calling dlm_purge_lockres!\n");
+
+ dlm_lockres_put(lockres);
/* Avoid adding any scheduling latencies */
cond_resched_lock(&dlm->spinlock);
@@ -288,19 +289,15 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
struct dlm_lock *lock, *target;
- struct list_head *iter;
- struct list_head *head;
int can_grant = 1;
- //mlog(0, "res->lockname.len=%d\n", res->lockname.len);
- //mlog(0, "res->lockname.name=%p\n", res->lockname.name);
- //mlog(0, "shuffle res %.*s\n", res->lockname.len,
- // res->lockname.name);
-
- /* because this function is called with the lockres
+ /*
+ * Because this function is called with the lockres
* spinlock, and because we know that it is not migrating/
* recovering/in-progress, it is fine to reserve asts and
- * basts right before queueing them all throughout */
+ * basts right before queueing them all throughout
+ */
+ assert_spin_locked(&dlm->ast_lock);
assert_spin_locked(&res->spinlock);
BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
DLM_LOCK_RES_RECOVERING|
@@ -309,18 +306,16 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
converting:
if (list_empty(&res->converting))
goto blocked;
- mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
- res->lockname.name);
+ mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
+ res->lockname.len, res->lockname.name);
target = list_entry(res->converting.next, struct dlm_lock, list);
if (target->ml.convert_type == LKM_IVMODE) {
- mlog(ML_ERROR, "%.*s: converting a lock with no "
- "convert_type!\n", res->lockname.len, res->lockname.name);
+ mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
+ dlm->name, res->lockname.len, res->lockname.name);
BUG();
}
- head = &res->granted;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->granted, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type,
@@ -329,7 +324,7 @@ converting:
/* queue the BAST if not already */
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
/* update the highest_blocked if needed */
if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -337,9 +332,8 @@ converting:
target->ml.convert_type;
}
}
- head = &res->converting;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+
+ list_for_each_entry(lock, &res->converting, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type,
@@ -347,7 +341,7 @@ converting:
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.convert_type)
lock->ml.highest_blocked =
@@ -360,9 +354,12 @@ converting:
spin_lock(&target->spinlock);
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
- mlog(0, "calling ast for converting lock: %.*s, have: %d, "
- "granting: %d, node: %u\n", res->lockname.len,
- res->lockname.name, target->ml.type,
+ mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
+ "%d => %d, node %u\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
+ target->ml.type,
target->ml.convert_type, target->ml.node);
target->ml.type = target->ml.convert_type;
@@ -375,7 +372,7 @@ converting:
spin_unlock(&target->spinlock);
__dlm_lockres_reserve_ast(res);
- dlm_queue_ast(dlm, target);
+ __dlm_queue_ast(dlm, target);
/* go back and check for more */
goto converting;
}
@@ -385,32 +382,28 @@ blocked:
goto leave;
target = list_entry(res->blocked.next, struct dlm_lock, list);
- head = &res->granted;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->granted, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.type)
lock->ml.highest_blocked = target->ml.type;
}
}
- head = &res->converting;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->converting, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.type)
lock->ml.highest_blocked = target->ml.type;
@@ -423,11 +416,14 @@ blocked:
spin_lock(&target->spinlock);
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
- mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
- "node: %u\n", res->lockname.len, res->lockname.name,
+ mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
+ "node %u\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
target->ml.type, target->ml.node);
- // target->ml.type is already correct
+ /* target->ml.type is already correct */
list_move_tail(&target->list, &res->granted);
BUG_ON(!target->lksb);
@@ -436,7 +432,7 @@ blocked:
spin_unlock(&target->spinlock);
__dlm_lockres_reserve_ast(res);
- dlm_queue_ast(dlm, target);
+ __dlm_queue_ast(dlm, target);
/* go back and check for more */
goto converting;
}
@@ -448,7 +444,6 @@ leave:
/* must have NO locks when calling this with res !=NULL * */
void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- mlog_entry("dlm=%p, res=%p\n", dlm, res);
if (res) {
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
@@ -461,26 +456,32 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- mlog_entry("dlm=%p, res=%p\n", dlm, res);
-
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
/* don't shuffle secondary queues */
- if ((res->owner == dlm->node_num) &&
- !(res->state & DLM_LOCK_RES_DIRTY)) {
- /* ref for dirty_list */
- dlm_lockres_get(res);
- list_add_tail(&res->dirty, &dlm->dirty_list);
- res->state |= DLM_LOCK_RES_DIRTY;
+ if ((res->owner == dlm->node_num)) {
+ if (res->state & (DLM_LOCK_RES_MIGRATING |
+ DLM_LOCK_RES_BLOCK_DIRTY))
+ return;
+
+ if (list_empty(&res->dirty)) {
+ /* ref for dirty_list */
+ dlm_lockres_get(res);
+ list_add_tail(&res->dirty, &dlm->dirty_list);
+ res->state |= DLM_LOCK_RES_DIRTY;
+ }
}
+
+ mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
}
/* Launch the NM thread for the mounted volume */
int dlm_launch_thread(struct dlm_ctxt *dlm)
{
- mlog(0, "starting dlm thread...\n");
+ mlog(0, "Starting dlm_thread...\n");
dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
if (IS_ERR(dlm->dlm_thread_task)) {
@@ -495,7 +496,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
void dlm_complete_thread(struct dlm_ctxt *dlm)
{
if (dlm->dlm_thread_task) {
- mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
+ mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
kthread_stop(dlm->dlm_thread_task);
dlm->dlm_thread_task = NULL;
}
@@ -526,7 +527,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
/* get an extra ref on lock */
dlm_lock_get(lock);
res = lock->lockres;
- mlog(0, "delivering an ast for this lockres\n");
+ mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
+ "node %u\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->ml.type, lock->ml.node);
BUG_ON(!lock->ast_pending);
@@ -547,9 +553,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
/* possible that another ast was queued while
* we were delivering the last one */
if (!list_empty(&lock->ast_list)) {
- mlog(0, "aha another ast got queued while "
- "we were finishing the last one. will "
- "keep the ast_pending flag set.\n");
+ mlog(0, "%s: res %.*s, AST queued while flushing last "
+ "one\n", dlm->name, res->lockname.len,
+ res->lockname.name);
} else
lock->ast_pending = 0;
@@ -580,8 +586,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
dlm_lock_put(lock);
spin_unlock(&dlm->ast_lock);
- mlog(0, "delivering a bast for this lockres "
- "(blocked = %d\n", hi);
+ mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
+ "blocked %d, node %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ hi, lock->ml.node);
if (lock->ml.node != dlm->node_num) {
ret = dlm_send_proxy_bast(dlm, res, lock, hi);
@@ -595,9 +605,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
/* possible that another bast was queued while
* we were delivering the last one */
if (!list_empty(&lock->bast_list)) {
- mlog(0, "aha another bast got queued while "
- "we were finishing the last one. will "
- "keep the bast_pending flag set.\n");
+ mlog(0, "%s: res %.*s, BAST queued while flushing last "
+ "one\n", dlm->name, res->lockname.len,
+ res->lockname.name);
} else
lock->bast_pending = 0;
@@ -651,7 +661,7 @@ static int dlm_thread(void *data)
dlm_lockres_get(res);
spin_lock(&res->spinlock);
- res->state &= ~DLM_LOCK_RES_DIRTY;
+ /* We clear the DLM_LOCK_RES_DIRTY state once we shuffle lists below */
list_del_init(&res->dirty);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
@@ -661,27 +671,31 @@ static int dlm_thread(void *data)
/* lockres can be re-dirtied/re-added to the
* dirty_list in this gap, but that is ok */
+ spin_lock(&dlm->ast_lock);
spin_lock(&res->spinlock);
if (res->owner != dlm->node_num) {
__dlm_print_one_lock_resource(res);
- mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
- res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
- res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
- res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
- res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+ mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
+ " dirty %d\n", dlm->name,
+ !!(res->state & DLM_LOCK_RES_IN_PROGRESS),
+ !!(res->state & DLM_LOCK_RES_MIGRATING),
+ !!(res->state & DLM_LOCK_RES_RECOVERING),
+ !!(res->state & DLM_LOCK_RES_DIRTY));
}
BUG_ON(res->owner != dlm->node_num);
/* it is now ok to move lockreses in these states
* to the dirty list, assuming that they will only be
* dirty for a short while. */
+ BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
- DLM_LOCK_RES_MIGRATING |
DLM_LOCK_RES_RECOVERING)) {
/* move it to the tail and keep going */
+ res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
- mlog(0, "delaying list shuffling for in-"
- "progress lockres %.*s, state=%d\n",
+ spin_unlock(&dlm->ast_lock);
+ mlog(0, "%s: res %.*s, inprogress, delay list "
+ "shuffle, state %d\n", dlm->name,
res->lockname.len, res->lockname.name,
res->state);
delay = 1;
@@ -693,13 +707,11 @@ static int dlm_thread(void *data)
* spinlock and do NOT have the dlm lock.
* safe to reserve/queue asts and run the lists. */
- mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
- "res=%.*s\n", dlm->name,
- res->lockname.len, res->lockname.name);
-
/* called while holding lockres lock */
dlm_shuffle_lists(dlm, res);
+ res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->ast_lock);
dlm_lockres_calc_usage(dlm, res);
@@ -709,11 +721,8 @@ in_progress:
/* if the lock was in-progress, stick
* it on the back of the list */
if (delay) {
- /* ref for dirty_list */
- dlm_lockres_get(res);
spin_lock(&res->spinlock);
- list_add_tail(&res->dirty, &dlm->dirty_list);
- res->state |= DLM_LOCK_RES_DIRTY;
+ __dlm_dirty_lockres(dlm, res);
spin_unlock(&res->spinlock);
}
dlm_lockres_put(res);
@@ -721,7 +730,8 @@ in_progress:
/* unlikely, but we may need to give time to
* other tasks */
if (!--n) {
- mlog(0, "throttling dlm_thread\n");
+ mlog(0, "%s: Throttling dlm thread\n",
+ dlm->name);
break;
}
}
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 37be4b2e0d4..2e3c9dbab68 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,9 +28,7 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -117,12 +115,12 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
else
BUG_ON(res->owner == dlm->node_num);
- spin_lock(&dlm->spinlock);
+ spin_lock(&dlm->ast_lock);
/* We want to be sure that we're not freeing a lock
* that still has AST's pending... */
in_use = !list_empty(&lock->ast_list);
- spin_unlock(&dlm->spinlock);
- if (in_use) {
+ spin_unlock(&dlm->ast_lock);
+ if (in_use && !(flags & LKM_CANCEL)) {
mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
"while waiting for an ast!", res->lockname.len,
res->lockname.name);
@@ -131,7 +129,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
- if (master_node) {
+ if (master_node && !(flags & LKM_CANCEL)) {
mlog(ML_ERROR, "lockres in progress!\n");
spin_unlock(&res->spinlock);
return DLM_FORWARD;
@@ -147,6 +145,10 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
goto leave;
}
+ if (res->state & DLM_LOCK_RES_MIGRATING) {
+ status = DLM_MIGRATING;
+ goto leave;
+ }
/* see above for what the spec says about
* LKM_CANCEL and the lock queue state */
@@ -187,9 +189,11 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
DLM_UNLOCK_REGRANT_LOCK|
DLM_UNLOCK_CLEAR_CONVERT_TYPE);
- } else if (status == DLM_RECOVERING ||
- status == DLM_MIGRATING ||
- status == DLM_FORWARD) {
+ } else if (status == DLM_RECOVERING ||
+ status == DLM_MIGRATING ||
+ status == DLM_FORWARD ||
+ status == DLM_NOLOCKMGR
+ ) {
/* must clear the actions because this unlock
* is about to be retried. cannot free or do
* any list manipulation. */
@@ -198,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
res->lockname.name,
status==DLM_RECOVERING?"recovering":
(status==DLM_MIGRATING?"migrating":
- "forward"));
+ (status == DLM_FORWARD ? "forward" :
+ "nolockmanager")));
actions = 0;
}
if (flags & LKM_CANCEL)
@@ -244,8 +249,8 @@ leave:
/* this should always be coupled with list removal */
BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
- dlm_get_lock_cookie_node(lock->ml.cookie),
- dlm_get_lock_cookie_seq(lock->ml.cookie),
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
atomic_read(&lock->lock_refs.refcount)-1);
dlm_lock_put(lock);
}
@@ -315,7 +320,7 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
struct kvec vec[2];
size_t veclen = 1;
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+ mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
if (owner == dlm->node_num) {
/* ended up trying to contact ourself. this means
@@ -352,7 +357,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
mlog(0, "master was in-progress. retry\n");
ret = status;
} else {
- mlog_errno(tmpret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
if (dlm_is_host_down(tmpret)) {
/* NOTE: this seems strange, but it is what we want.
* when the master goes down during a cancel or
@@ -361,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
* updated state to the recovery master. this thread
* just needs to finish out the operation and call
* the unlockast. */
- ret = DLM_NORMAL;
+ if (dlm_is_node_dead(dlm, owner))
+ ret = DLM_NORMAL;
+ else
+ ret = DLM_NOLOCKMGR;
} else {
/* something bad. this will BUG in ocfs2 */
ret = dlm_err_to_dlm_status(tmpret);
@@ -379,12 +388,12 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
* returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
* return value from dlmunlock_master
*/
-int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data)
+int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
- struct list_head *iter;
struct dlm_lock *lock = NULL;
enum dlm_status status = DLM_NORMAL;
int found = 0, i;
@@ -454,8 +463,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data)
}
for (i=0; i<3; i++) {
- list_for_each(iter, queue) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, queue, list) {
if (lock->ml.cookie == unlock->cookie &&
lock->ml.node == unlock->node_idx) {
dlm_lock_get(lock);
@@ -502,8 +510,8 @@ not_found:
if (!found)
mlog(ML_ERROR, "failed to find lock to unlock! "
"cookie=%u:%llu\n",
- dlm_get_lock_cookie_node(unlock->cookie),
- dlm_get_lock_cookie_seq(unlock->cookie));
+ dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(unlock->cookie)));
else
dlm_lock_put(lock);
@@ -584,8 +592,6 @@ enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
struct dlm_lock *lock = NULL;
int call_ast, is_master;
- mlog_entry_void();
-
if (!lksb) {
dlm_error(DLM_BADARGS);
return DLM_BADARGS;
@@ -638,7 +644,9 @@ retry:
if (status == DLM_RECOVERING ||
status == DLM_MIGRATING ||
- status == DLM_FORWARD) {
+ status == DLM_FORWARD ||
+ status == DLM_NOLOCKMGR) {
+
/* We want to go away for a tiny bit to allow recovery
* / migration to complete on this resource. I don't
* know of any wait queue we could sleep on as this
@@ -650,21 +658,21 @@ retry:
msleep(50);
mlog(0, "retrying unlock due to pending recovery/"
- "migration/in-progress\n");
+ "migration/in-progress/reconnect\n");
goto retry;
}
if (call_ast) {
mlog(0, "calling unlockast(%p, %d)\n", data, status);
if (is_master) {
- /* it is possible that there is one last bast
+ /* it is possible that there is one last bast
* pending. make sure it is flushed, then
* call the unlockast.
* not an issue if this is a mastered remotely,
* since this lock has been removed from the
* lockres queues and cannot be found. */
dlm_kick_thread(dlm, NULL);
- wait_event(dlm->ast_wq,
+ wait_event(dlm->ast_wq,
dlm_lock_basts_flushed(dlm, lock));
}
(*unlockast)(data, status);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
deleted file mode 100644
index 7ef2653f8f4..00000000000
--- a/fs/ocfs2/dlm/dlmver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmver.h"
-
-#define DLM_BUILD_VERSION "1.3.3"
-
-#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
-
-void dlm_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
deleted file mode 100644
index f674aee77a1..00000000000
--- a/fs/ocfs2/dlm/dlmver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLM_VER_H
-#define DLM_VER_H
-
-void dlm_print_version(void);
-
-#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
new file mode 100644
index 00000000000..eed3db8c5b4
--- /dev/null
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -0,0 +1,5 @@
+ccflags-y := -Ifs/ocfs2
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
+
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 0368c640218..09b7d9dac71 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -42,34 +42,68 @@
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
-#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
+#include <linux/poll.h>
#include <asm/uaccess.h>
-
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
-#include "dlmapi.h"
-
+#include "stackglue.h"
#include "userdlm.h"
-#include "dlmfsver.h"
-
#define MLOG_MASK_PREFIX ML_DLMFS
#include "cluster/masklog.h"
-static struct super_operations dlmfs_ops;
-static struct file_operations dlmfs_file_operations;
-static struct inode_operations dlmfs_dir_inode_operations;
-static struct inode_operations dlmfs_root_inode_operations;
-static struct inode_operations dlmfs_file_inode_operations;
-static kmem_cache_t *dlmfs_inode_cache;
+
+static const struct super_operations dlmfs_ops;
+static const struct file_operations dlmfs_file_operations;
+static const struct inode_operations dlmfs_dir_inode_operations;
+static const struct inode_operations dlmfs_root_inode_operations;
+static const struct inode_operations dlmfs_file_inode_operations;
+static struct kmem_cache *dlmfs_inode_cache;
struct workqueue_struct *user_dlm_worker;
+
+
+/*
+ * These are the ABI capabilities of dlmfs.
+ *
+ * Over time, dlmfs has added some features that were not part of the
+ * initial ABI. Unfortunately, some of these features are not detectable
+ * via standard usage. For example, Linux's default poll always returns
+ * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
+ * added poll support. Instead, we provide this list of new capabilities.
+ *
+ * Capabilities is a read-only attribute. We do it as a module parameter
+ * so we can discover it whether dlmfs is built in, loaded, or even not
+ * loaded.
+ *
+ * The ABI features are local to this machine's dlmfs mount. This is
+ * distinct from the locking protocol, which is concerned with inter-node
+ * interaction.
+ *
+ * Capabilities:
+ * - bast : POLLIN against the file descriptor of a held lock
+ * signifies a bast fired on the lock.
+ */
+#define DLMFS_CAPABILITIES "bast stackglue"
+static int param_set_dlmfs_capabilities(const char *val,
+ struct kernel_param *kp)
+{
+ printk(KERN_ERR "%s: readonly parameter\n", kp->name);
+ return -EINVAL;
+}
+static int param_get_dlmfs_capabilities(char *buffer,
+ struct kernel_param *kp)
+{
+ return strlcpy(buffer, DLMFS_CAPABILITIES,
+ strlen(DLMFS_CAPABILITIES) + 1);
+}
+module_param_call(capabilities, param_set_dlmfs_capabilities,
+ param_get_dlmfs_capabilities, NULL, 0444);
+MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
+
+
/*
* decodes a set of open flags into a valid lock level and a set of flags.
* returns < 0 if we have invalid flags
@@ -77,20 +111,20 @@ struct workqueue_struct *user_dlm_worker;
* O_RDONLY -> PRMODE level
* O_WRONLY -> EXMODE level
*
- * O_NONBLOCK -> LKM_NOQUEUE
+ * O_NONBLOCK -> NOQUEUE
*/
static int dlmfs_decode_open_flags(int open_flags,
int *level,
int *flags)
{
if (open_flags & (O_WRONLY|O_RDWR))
- *level = LKM_EXMODE;
+ *level = DLM_LOCK_EX;
else
- *level = LKM_PRMODE;
+ *level = DLM_LOCK_PR;
*flags = 0;
if (open_flags & O_NONBLOCK)
- *flags |= LKM_NOQUEUE;
+ *flags |= DLM_LKF_NOQUEUE;
return 0;
}
@@ -131,7 +165,7 @@ static int dlmfs_file_open(struct inode *inode,
* to be able userspace to be able to distinguish a
* valid lock request from one that simply couldn't be
* granted. */
- if (flags & LKM_NOQUEUE && status == -EAGAIN)
+ if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
status = -ETXTBSY;
kfree(fp);
goto bail;
@@ -147,8 +181,7 @@ static int dlmfs_file_release(struct inode *inode,
{
int level, status;
struct dlmfs_inode_private *ip = DLMFS_I(inode);
- struct dlmfs_filp_private *fp =
- (struct dlmfs_filp_private *) file->private_data;
+ struct dlmfs_filp_private *fp = file->private_data;
if (S_ISDIR(inode->i_mode))
BUG();
@@ -158,7 +191,7 @@ static int dlmfs_file_release(struct inode *inode,
status = 0;
if (fp) {
level = fp->fp_lock_level;
- if (level != LKM_IVMODE)
+ if (level != DLM_LOCK_IV)
user_dlm_cluster_unlock(&ip->ip_lockres, level);
kfree(fp);
@@ -168,15 +201,50 @@ static int dlmfs_file_release(struct inode *inode,
return 0;
}
+/*
+ * We do ->setattr() just to override size changes. Our size is the size
+ * of the LVB and nothing else.
+ */
+static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ int error;
+ struct inode *inode = dentry->d_inode;
+
+ attr->ia_valid &= ~ATTR_SIZE;
+ error = inode_change_ok(inode, attr);
+ if (error)
+ return error;
+
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
+ return 0;
+}
+
+static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
+{
+ int event = 0;
+ struct inode *inode = file_inode(file);
+ struct dlmfs_inode_private *ip = DLMFS_I(inode);
+
+ poll_wait(file, &ip->ip_lockres.l_event, wait);
+
+ spin_lock(&ip->ip_lockres.l_lock);
+ if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
+ event = POLLIN | POLLRDNORM;
+ spin_unlock(&ip->ip_lockres.l_lock);
+
+ return event;
+}
+
static ssize_t dlmfs_file_read(struct file *filp,
char __user *buf,
size_t count,
loff_t *ppos)
{
int bytes_left;
- ssize_t readlen;
+ ssize_t readlen, got;
char *lvb_buf;
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
inode->i_ino, count, *ppos);
@@ -194,15 +262,19 @@ static ssize_t dlmfs_file_read(struct file *filp,
if ((count + *ppos) > i_size_read(inode))
readlen = i_size_read(inode) - *ppos;
else
- readlen = count - *ppos;
+ readlen = count;
lvb_buf = kmalloc(readlen, GFP_NOFS);
if (!lvb_buf)
return -ENOMEM;
- user_dlm_read_lvb(inode, lvb_buf, readlen);
- bytes_left = __copy_to_user(buf, lvb_buf, readlen);
- readlen -= bytes_left;
+ got = user_dlm_read_lvb(inode, lvb_buf, readlen);
+ if (got) {
+ BUG_ON(got != readlen);
+ bytes_left = __copy_to_user(buf, lvb_buf, readlen);
+ readlen -= bytes_left;
+ } else
+ readlen = 0;
kfree(lvb_buf);
@@ -220,7 +292,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
int bytes_left;
ssize_t writelen;
char *lvb_buf;
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
inode->i_ino, count, *ppos);
@@ -256,45 +328,45 @@ static ssize_t dlmfs_file_write(struct file *filp,
return writelen;
}
-static void dlmfs_init_once(void *foo,
- kmem_cache_t *cachep,
- unsigned long flags)
+static void dlmfs_init_once(void *foo)
{
struct dlmfs_inode_private *ip =
(struct dlmfs_inode_private *) foo;
- if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
- SLAB_CTOR_CONSTRUCTOR) {
- ip->ip_dlm = NULL;
- ip->ip_parent = NULL;
+ ip->ip_conn = NULL;
+ ip->ip_parent = NULL;
- inode_init_once(&ip->ip_vfs_inode);
- }
+ inode_init_once(&ip->ip_vfs_inode);
}
static struct inode *dlmfs_alloc_inode(struct super_block *sb)
{
struct dlmfs_inode_private *ip;
- ip = kmem_cache_alloc(dlmfs_inode_cache, SLAB_NOFS);
+ ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS);
if (!ip)
return NULL;
return &ip->ip_vfs_inode;
}
-static void dlmfs_destroy_inode(struct inode *inode)
+static void dlmfs_i_callback(struct rcu_head *head)
{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
}
-static void dlmfs_clear_inode(struct inode *inode)
+static void dlmfs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, dlmfs_i_callback);
+}
+
+static void dlmfs_evict_inode(struct inode *inode)
{
int status;
struct dlmfs_inode_private *ip;
- if (!inode)
- return;
+ clear_inode(inode);
mlog(0, "inode %lu\n", inode->i_ino);
@@ -308,37 +380,33 @@ static void dlmfs_clear_inode(struct inode *inode)
goto clear_fields;
}
- mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
+ mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
/* we must be a directory. If required, lets unregister the
* dlm context now. */
- if (ip->ip_dlm)
- user_dlm_unregister_context(ip->ip_dlm);
+ if (ip->ip_conn)
+ user_dlm_unregister(ip->ip_conn);
clear_fields:
ip->ip_parent = NULL;
- ip->ip_dlm = NULL;
+ ip->ip_conn = NULL;
}
static struct backing_dev_info dlmfs_backing_dev_info = {
+ .name = "ocfs2-dlmfs",
.ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
static struct inode *dlmfs_get_root_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
- int mode = S_IFDIR | 0755;
- struct dlmfs_inode_private *ip;
+ umode_t mode = S_IFDIR | 0755;
if (inode) {
- ip = DLMFS_I(inode);
-
- inode->i_mode = mode;
- inode->i_uid = current->fsuid;
- inode->i_gid = current->fsgid;
- inode->i_blocks = 0;
+ inode->i_ino = get_next_ino();
+ inode_init_owner(inode, NULL, mode);
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_nlink++;
+ inc_nlink(inode);
inode->i_fop = &simple_dir_operations;
inode->i_op = &dlmfs_root_inode_operations;
@@ -349,7 +417,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
static struct inode *dlmfs_get_inode(struct inode *parent,
struct dentry *dentry,
- int mode)
+ umode_t mode)
{
struct super_block *sb = parent->i_sb;
struct inode * inode = new_inode(sb);
@@ -358,15 +426,13 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
if (!inode)
return NULL;
- inode->i_mode = mode;
- inode->i_uid = current->fsuid;
- inode->i_gid = current->fsgid;
- inode->i_blocks = 0;
+ inode->i_ino = get_next_ino();
+ inode_init_owner(inode, parent, mode);
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
ip = DLMFS_I(inode);
- ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
+ ip->ip_conn = DLMFS_I(parent)->ip_conn;
switch (mode & S_IFMT) {
default:
@@ -395,16 +461,9 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
/* directory inodes start off with i_nlink ==
* 2 (for "." entry) */
- inode->i_nlink++;
+ inc_nlink(inode);
break;
}
-
- if (parent->i_mode & S_ISGID) {
- inode->i_gid = parent->i_gid;
- if (S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
- }
-
return inode;
}
@@ -414,18 +473,18 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
/* SMP-safe */
static int dlmfs_mkdir(struct inode * dir,
struct dentry * dentry,
- int mode)
+ umode_t mode)
{
int status;
struct inode *inode = NULL;
struct qstr *domain = &dentry->d_name;
struct dlmfs_inode_private *ip;
- struct dlm_ctxt *dlm;
+ struct ocfs2_cluster_connection *conn;
mlog(0, "mkdir %.*s\n", domain->len, domain->name);
/* verify that we have a proper domain */
- if (domain->len >= O2NM_MAX_NAME_LEN) {
+ if (domain->len >= GROUP_NAME_MAX) {
status = -EINVAL;
mlog(ML_ERROR, "invalid domain name for directory.\n");
goto bail;
@@ -440,16 +499,16 @@ static int dlmfs_mkdir(struct inode * dir,
ip = DLMFS_I(inode);
- dlm = user_dlm_register_context(domain);
- if (IS_ERR(dlm)) {
- status = PTR_ERR(dlm);
+ conn = user_dlm_register(domain);
+ if (IS_ERR(conn)) {
+ status = PTR_ERR(conn);
mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
status, domain->len, domain->name);
goto bail;
}
- ip->ip_dlm = dlm;
+ ip->ip_conn = conn;
- dir->i_nlink++;
+ inc_nlink(dir);
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
@@ -462,8 +521,8 @@ bail:
static int dlmfs_create(struct inode *dir,
struct dentry *dentry,
- int mode,
- struct nameidata *nd)
+ umode_t mode,
+ bool excl)
{
int status = 0;
struct inode *inode;
@@ -519,86 +578,84 @@ static int dlmfs_fill_super(struct super_block * sb,
void * data,
int silent)
{
- struct inode * inode;
- struct dentry * root;
-
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = DLMFS_MAGIC;
sb->s_op = &dlmfs_ops;
- inode = dlmfs_get_root_inode(sb);
- if (!inode)
+ sb->s_root = d_make_root(dlmfs_get_root_inode(sb));
+ if (!sb->s_root)
return -ENOMEM;
-
- root = d_alloc_root(inode);
- if (!root) {
- iput(inode);
- return -ENOMEM;
- }
- sb->s_root = root;
return 0;
}
-static struct file_operations dlmfs_file_operations = {
+static const struct file_operations dlmfs_file_operations = {
.open = dlmfs_file_open,
.release = dlmfs_file_release,
+ .poll = dlmfs_file_poll,
.read = dlmfs_file_read,
.write = dlmfs_file_write,
+ .llseek = default_llseek,
};
-static struct inode_operations dlmfs_dir_inode_operations = {
+static const struct inode_operations dlmfs_dir_inode_operations = {
.create = dlmfs_create,
.lookup = simple_lookup,
.unlink = dlmfs_unlink,
};
/* this way we can restrict mkdir to only the toplevel of the fs. */
-static struct inode_operations dlmfs_root_inode_operations = {
+static const struct inode_operations dlmfs_root_inode_operations = {
.lookup = simple_lookup,
.mkdir = dlmfs_mkdir,
.rmdir = simple_rmdir,
};
-static struct super_operations dlmfs_ops = {
+static const struct super_operations dlmfs_ops = {
.statfs = simple_statfs,
.alloc_inode = dlmfs_alloc_inode,
.destroy_inode = dlmfs_destroy_inode,
- .clear_inode = dlmfs_clear_inode,
+ .evict_inode = dlmfs_evict_inode,
.drop_inode = generic_delete_inode,
};
-static struct inode_operations dlmfs_file_inode_operations = {
+static const struct inode_operations dlmfs_file_inode_operations = {
.getattr = simple_getattr,
+ .setattr = dlmfs_file_setattr,
};
-static int dlmfs_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
{
- return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
+ return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
}
static struct file_system_type dlmfs_fs_type = {
.owner = THIS_MODULE,
.name = "ocfs2_dlmfs",
- .get_sb = dlmfs_get_sb,
+ .mount = dlmfs_mount,
.kill_sb = kill_litter_super,
};
+MODULE_ALIAS_FS("ocfs2_dlmfs");
static int __init init_dlmfs_fs(void)
{
int status;
int cleanup_inode = 0, cleanup_worker = 0;
- dlmfs_print_version();
+ status = bdi_init(&dlmfs_backing_dev_info);
+ if (status)
+ return status;
dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
sizeof(struct dlmfs_inode_private),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
- dlmfs_init_once, NULL);
- if (!dlmfs_inode_cache)
- return -ENOMEM;
+ dlmfs_init_once);
+ if (!dlmfs_inode_cache) {
+ status = -ENOMEM;
+ goto bail;
+ }
cleanup_inode = 1;
user_dlm_worker = create_singlethread_workqueue("user_dlm");
@@ -608,6 +665,7 @@ static int __init init_dlmfs_fs(void)
}
cleanup_worker = 1;
+ user_dlm_set_locking_protocol();
status = register_filesystem(&dlmfs_fs_type);
bail:
if (status) {
@@ -615,6 +673,7 @@ bail:
kmem_cache_destroy(dlmfs_inode_cache);
if (cleanup_worker)
destroy_workqueue(user_dlm_worker);
+ bdi_destroy(&dlmfs_backing_dev_info);
} else
printk("OCFS2 User DLM kernel interface loaded\n");
return status;
@@ -627,11 +686,19 @@ static void __exit exit_dlmfs_fs(void)
flush_workqueue(user_dlm_worker);
destroy_workqueue(user_dlm_worker);
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(dlmfs_inode_cache);
+
+ bdi_destroy(&dlmfs_backing_dev_info);
}
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
module_init(init_dlmfs_fs)
module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index eead48bbfac..0499e3fb7bd 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -34,18 +34,19 @@
#include <linux/types.h>
#include <linux/crc32.h>
-
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
-#include "dlmapi.h"
-
+#include "ocfs2_lockingver.h"
+#include "stackglue.h"
#include "userdlm.h"
#define MLOG_MASK_PREFIX ML_DLMFS
#include "cluster/masklog.h"
+
+static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
+{
+ return container_of(lksb, struct user_lock_res, l_lksb);
+}
+
static inline int user_check_wait_flag(struct user_lock_res *lockres,
int flag)
{
@@ -73,15 +74,15 @@ static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
}
/* I heart container_of... */
-static inline struct dlm_ctxt *
-dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
+static inline struct ocfs2_cluster_connection *
+cluster_connection_from_user_lockres(struct user_lock_res *lockres)
{
struct dlmfs_inode_private *ip;
ip = container_of(lockres,
struct dlmfs_inode_private,
ip_lockres);
- return ip->ip_dlm;
+ return ip->ip_conn;
}
static struct inode *
@@ -103,9 +104,9 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
}
#define user_log_dlm_error(_func, _stat, _lockres) do { \
- mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \
- "resource %.*s: %s\n", dlm_errname(_stat), _func, \
- _lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \
+ mlog(ML_ERROR, "Dlm error %d while calling %s on " \
+ "resource %.*s\n", _stat, _func, \
+ _lockres->l_namelen, _lockres->l_name); \
} while (0)
/* WARNING: This function lives in a world where the only three lock
@@ -113,34 +114,35 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
* lock types are added. */
static inline int user_highest_compat_lock_level(int level)
{
- int new_level = LKM_EXMODE;
+ int new_level = DLM_LOCK_EX;
- if (level == LKM_EXMODE)
- new_level = LKM_NLMODE;
- else if (level == LKM_PRMODE)
- new_level = LKM_PRMODE;
+ if (level == DLM_LOCK_EX)
+ new_level = DLM_LOCK_NL;
+ else if (level == DLM_LOCK_PR)
+ new_level = DLM_LOCK_PR;
return new_level;
}
-static void user_ast(void *opaque)
+static void user_ast(struct ocfs2_dlm_lksb *lksb)
{
- struct user_lock_res *lockres = opaque;
- struct dlm_lockstatus *lksb;
+ struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
+ int status;
- mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen,
- lockres->l_name);
+ mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
+ lockres->l_namelen, lockres->l_name, lockres->l_level,
+ lockres->l_requested);
spin_lock(&lockres->l_lock);
- lksb = &(lockres->l_lksb);
- if (lksb->status != DLM_NORMAL) {
+ status = ocfs2_dlm_lock_status(&lockres->l_lksb);
+ if (status) {
mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
- lksb->status, lockres->l_namelen, lockres->l_name);
+ status, lockres->l_namelen, lockres->l_name);
spin_unlock(&lockres->l_lock);
return;
}
- mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE,
+ mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
"Lockres %.*s, requested ivmode. flags 0x%x\n",
lockres->l_namelen, lockres->l_name, lockres->l_flags);
@@ -148,13 +150,13 @@ static void user_ast(void *opaque)
if (lockres->l_requested < lockres->l_level) {
if (lockres->l_requested <=
user_highest_compat_lock_level(lockres->l_blocking)) {
- lockres->l_blocking = LKM_NLMODE;
+ lockres->l_blocking = DLM_LOCK_NL;
lockres->l_flags &= ~USER_LOCK_BLOCKED;
}
}
lockres->l_level = lockres->l_requested;
- lockres->l_requested = LKM_IVMODE;
+ lockres->l_requested = DLM_LOCK_IV;
lockres->l_flags |= USER_LOCK_ATTACHED;
lockres->l_flags &= ~USER_LOCK_BUSY;
@@ -171,15 +173,14 @@ static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
BUG();
}
-static void user_dlm_unblock_lock(void *opaque);
+static void user_dlm_unblock_lock(struct work_struct *work);
static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
{
if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
user_dlm_grab_inode_ref(lockres);
- INIT_WORK(&lockres->l_work, user_dlm_unblock_lock,
- lockres);
+ INIT_WORK(&lockres->l_work, user_dlm_unblock_lock);
queue_work(user_dlm_worker, &lockres->l_work);
lockres->l_flags |= USER_LOCK_QUEUED;
@@ -194,11 +195,11 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
return;
switch (lockres->l_blocking) {
- case LKM_EXMODE:
+ case DLM_LOCK_EX:
if (!lockres->l_ex_holders && !lockres->l_ro_holders)
queue = 1;
break;
- case LKM_PRMODE:
+ case DLM_LOCK_PR:
if (!lockres->l_ex_holders)
queue = 1;
break;
@@ -210,12 +211,12 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
__user_dlm_queue_lockres(lockres);
}
-static void user_bast(void *opaque, int level)
+static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
{
- struct user_lock_res *lockres = opaque;
+ struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
- mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n",
- lockres->l_namelen, lockres->l_name, level);
+ mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
+ lockres->l_namelen, lockres->l_name, level, lockres->l_level);
spin_lock(&lockres->l_lock);
lockres->l_flags |= USER_LOCK_BLOCKED;
@@ -228,15 +229,15 @@ static void user_bast(void *opaque, int level)
wake_up(&lockres->l_event);
}
-static void user_unlock_ast(void *opaque, enum dlm_status status)
+static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
{
- struct user_lock_res *lockres = opaque;
+ struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
- mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen,
- lockres->l_name);
+ mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
+ lockres->l_namelen, lockres->l_name, lockres->l_flags);
- if (status != DLM_NORMAL && status != DLM_CANCELGRANT)
- mlog(ML_ERROR, "Dlm returns status %d\n", status);
+ if (status)
+ mlog(ML_ERROR, "dlm returns status %d\n", status);
spin_lock(&lockres->l_lock);
/* The teardown flag gets set early during the unlock process,
@@ -244,7 +245,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
* for a concurrent cancel. */
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
&& !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
- lockres->l_level = LKM_IVMODE;
+ lockres->l_level = DLM_LOCK_IV;
} else if (status == DLM_CANCELGRANT) {
/* We tried to cancel a convert request, but it was
* already granted. Don't clear the busy flag - the
@@ -255,7 +256,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
} else {
BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
/* Cancel succeeded, we want to re-queue */
- lockres->l_requested = LKM_IVMODE; /* cancel an
+ lockres->l_requested = DLM_LOCK_IV; /* cancel an
* upconvert
* request. */
lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
@@ -272,6 +273,21 @@ out_noclear:
wake_up(&lockres->l_event);
}
+/*
+ * This is the userdlmfs locking protocol version.
+ *
+ * See fs/ocfs2/dlmglue.c for more details on locking versions.
+ */
+static struct ocfs2_locking_protocol user_dlm_lproto = {
+ .lp_max_version = {
+ .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+ .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+ },
+ .lp_lock_ast = user_ast,
+ .lp_blocking_ast = user_bast,
+ .lp_unlock_ast = user_unlock_ast,
+};
+
static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
{
struct inode *inode;
@@ -279,14 +295,15 @@ static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
iput(inode);
}
-static void user_dlm_unblock_lock(void *opaque)
+static void user_dlm_unblock_lock(struct work_struct *work)
{
int new_level, status;
- struct user_lock_res *lockres = (struct user_lock_res *) opaque;
- struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+ struct user_lock_res *lockres =
+ container_of(work, struct user_lock_res, l_work);
+ struct ocfs2_cluster_connection *conn =
+ cluster_connection_from_user_lockres(lockres);
- mlog(0, "processing lockres %.*s\n", lockres->l_namelen,
- lockres->l_name);
+ mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
spin_lock(&lockres->l_lock);
@@ -304,17 +321,23 @@ static void user_dlm_unblock_lock(void *opaque)
* flag, and finally we might get another bast which re-queues
* us before our ast for the downconvert is called. */
if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
+ mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
+ lockres->l_namelen, lockres->l_name);
spin_unlock(&lockres->l_lock);
goto drop_ref;
}
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+ mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
+ lockres->l_namelen, lockres->l_name);
spin_unlock(&lockres->l_lock);
goto drop_ref;
}
if (lockres->l_flags & USER_LOCK_BUSY) {
if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
+ mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
+ lockres->l_namelen, lockres->l_name);
spin_unlock(&lockres->l_lock);
goto drop_ref;
}
@@ -322,32 +345,31 @@ static void user_dlm_unblock_lock(void *opaque)
lockres->l_flags |= USER_LOCK_IN_CANCEL;
spin_unlock(&lockres->l_lock);
- status = dlmunlock(dlm,
- &lockres->l_lksb,
- LKM_CANCEL,
- user_unlock_ast,
- lockres);
- if (status != DLM_NORMAL)
- user_log_dlm_error("dlmunlock", status, lockres);
+ status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
+ DLM_LKF_CANCEL);
+ if (status)
+ user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
goto drop_ref;
}
/* If there are still incompat holders, we can exit safely
* without worrying about re-queueing this lock as that will
* happen on the last call to user_cluster_unlock. */
- if ((lockres->l_blocking == LKM_EXMODE)
+ if ((lockres->l_blocking == DLM_LOCK_EX)
&& (lockres->l_ex_holders || lockres->l_ro_holders)) {
spin_unlock(&lockres->l_lock);
- mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
- lockres->l_ro_holders, lockres->l_ex_holders);
+ mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
+ lockres->l_namelen, lockres->l_name,
+ lockres->l_ex_holders, lockres->l_ro_holders);
goto drop_ref;
}
- if ((lockres->l_blocking == LKM_PRMODE)
+ if ((lockres->l_blocking == DLM_LOCK_PR)
&& lockres->l_ex_holders) {
spin_unlock(&lockres->l_lock);
- mlog(0, "can't downconvert for pr: ex = %u\n",
- lockres->l_ex_holders);
+ mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
+ lockres->l_namelen, lockres->l_name,
+ lockres->l_ex_holders);
goto drop_ref;
}
@@ -355,22 +377,17 @@ static void user_dlm_unblock_lock(void *opaque)
new_level = user_highest_compat_lock_level(lockres->l_blocking);
lockres->l_requested = new_level;
lockres->l_flags |= USER_LOCK_BUSY;
- mlog(0, "Downconvert lock from %d to %d\n",
- lockres->l_level, new_level);
+ mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
+ lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
spin_unlock(&lockres->l_lock);
/* need lock downconvert request now... */
- status = dlmlock(dlm,
- new_level,
- &lockres->l_lksb,
- LKM_CONVERT|LKM_VALBLK,
- lockres->l_name,
- lockres->l_namelen,
- user_ast,
- lockres,
- user_bast);
- if (status != DLM_NORMAL) {
- user_log_dlm_error("dlmlock", status, lockres);
+ status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
+ DLM_LKF_CONVERT|DLM_LKF_VALBLK,
+ lockres->l_name,
+ lockres->l_namelen);
+ if (status) {
+ user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
user_recover_from_dlm_error(lockres);
}
@@ -382,10 +399,10 @@ static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
int level)
{
switch(level) {
- case LKM_EXMODE:
+ case DLM_LOCK_EX:
lockres->l_ex_holders++;
break;
- case LKM_PRMODE:
+ case DLM_LOCK_PR:
lockres->l_ro_holders++;
break;
default:
@@ -410,20 +427,19 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
int lkm_flags)
{
int status, local_flags;
- struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+ struct ocfs2_cluster_connection *conn =
+ cluster_connection_from_user_lockres(lockres);
- if (level != LKM_EXMODE &&
- level != LKM_PRMODE) {
+ if (level != DLM_LOCK_EX &&
+ level != DLM_LOCK_PR) {
mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
lockres->l_namelen, lockres->l_name);
status = -EINVAL;
goto bail;
}
- mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n",
- lockres->l_namelen, lockres->l_name,
- (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
- lkm_flags);
+ mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
+ lockres->l_namelen, lockres->l_name, level, lkm_flags);
again:
if (signal_pending(current)) {
@@ -457,35 +473,26 @@ again:
}
if (level > lockres->l_level) {
- local_flags = lkm_flags | LKM_VALBLK;
- if (lockres->l_level != LKM_IVMODE)
- local_flags |= LKM_CONVERT;
+ local_flags = lkm_flags | DLM_LKF_VALBLK;
+ if (lockres->l_level != DLM_LOCK_IV)
+ local_flags |= DLM_LKF_CONVERT;
lockres->l_requested = level;
lockres->l_flags |= USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
- BUG_ON(level == LKM_IVMODE);
- BUG_ON(level == LKM_NLMODE);
+ BUG_ON(level == DLM_LOCK_IV);
+ BUG_ON(level == DLM_LOCK_NL);
/* call dlm_lock to upgrade lock now */
- status = dlmlock(dlm,
- level,
- &lockres->l_lksb,
- local_flags,
- lockres->l_name,
- lockres->l_namelen,
- user_ast,
- lockres,
- user_bast);
- if (status != DLM_NORMAL) {
- if ((lkm_flags & LKM_NOQUEUE) &&
- (status == DLM_NOTQUEUED))
- status = -EAGAIN;
- else {
- user_log_dlm_error("dlmlock", status, lockres);
- status = -EINVAL;
- }
+ status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
+ local_flags, lockres->l_name,
+ lockres->l_namelen);
+ if (status) {
+ if ((lkm_flags & DLM_LKF_NOQUEUE) &&
+ (status != -EAGAIN))
+ user_log_dlm_error("ocfs2_dlm_lock",
+ status, lockres);
user_recover_from_dlm_error(lockres);
goto bail;
}
@@ -506,11 +513,11 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
int level)
{
switch(level) {
- case LKM_EXMODE:
+ case DLM_LOCK_EX:
BUG_ON(!lockres->l_ex_holders);
lockres->l_ex_holders--;
break;
- case LKM_PRMODE:
+ case DLM_LOCK_PR:
BUG_ON(!lockres->l_ro_holders);
lockres->l_ro_holders--;
break;
@@ -522,8 +529,8 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
void user_dlm_cluster_unlock(struct user_lock_res *lockres,
int level)
{
- if (level != LKM_EXMODE &&
- level != LKM_PRMODE) {
+ if (level != DLM_LOCK_EX &&
+ level != DLM_LOCK_PR) {
mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
lockres->l_namelen, lockres->l_name);
return;
@@ -540,33 +547,40 @@ void user_dlm_write_lvb(struct inode *inode,
unsigned int len)
{
struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
- char *lvb = lockres->l_lksb.lvb;
+ char *lvb;
BUG_ON(len > DLM_LVB_LEN);
spin_lock(&lockres->l_lock);
- BUG_ON(lockres->l_level < LKM_EXMODE);
+ BUG_ON(lockres->l_level < DLM_LOCK_EX);
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
memcpy(lvb, val, len);
spin_unlock(&lockres->l_lock);
}
-void user_dlm_read_lvb(struct inode *inode,
- char *val,
- unsigned int len)
+ssize_t user_dlm_read_lvb(struct inode *inode,
+ char *val,
+ unsigned int len)
{
struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
- char *lvb = lockres->l_lksb.lvb;
+ char *lvb;
+ ssize_t ret = len;
BUG_ON(len > DLM_LVB_LEN);
spin_lock(&lockres->l_lock);
- BUG_ON(lockres->l_level < LKM_PRMODE);
- memcpy(val, lvb, len);
+ BUG_ON(lockres->l_level < DLM_LOCK_PR);
+ if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ memcpy(val, lvb, len);
+ } else
+ ret = 0;
spin_unlock(&lockres->l_lock);
+ return ret;
}
void user_dlm_lock_res_init(struct user_lock_res *lockres,
@@ -576,9 +590,9 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
spin_lock_init(&lockres->l_lock);
init_waitqueue_head(&lockres->l_event);
- lockres->l_level = LKM_IVMODE;
- lockres->l_requested = LKM_IVMODE;
- lockres->l_blocking = LKM_IVMODE;
+ lockres->l_level = DLM_LOCK_IV;
+ lockres->l_requested = DLM_LOCK_IV;
+ lockres->l_blocking = DLM_LOCK_IV;
/* should have been checked before getting here. */
BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
@@ -592,9 +606,10 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
int user_dlm_destroy_lock(struct user_lock_res *lockres)
{
int status = -EBUSY;
- struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+ struct ocfs2_cluster_connection *conn =
+ cluster_connection_from_user_lockres(lockres);
- mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name);
+ mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
spin_lock(&lockres->l_lock);
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
@@ -627,14 +642,9 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
lockres->l_flags |= USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
- status = dlmunlock(dlm,
- &lockres->l_lksb,
- LKM_VALBLK,
- user_unlock_ast,
- lockres);
- if (status != DLM_NORMAL) {
- user_log_dlm_error("dlmunlock", status, lockres);
- status = -EINVAL;
+ status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
+ if (status) {
+ user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
goto bail;
}
@@ -645,31 +655,34 @@ bail:
return status;
}
-struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
+static void user_dlm_recovery_handler_noop(int node_num,
+ void *recovery_data)
{
- struct dlm_ctxt *dlm;
- u32 dlm_key;
- char *domain;
-
- domain = kmalloc(name->len + 1, GFP_NOFS);
- if (!domain) {
- mlog_errno(-ENOMEM);
- return ERR_PTR(-ENOMEM);
- }
+ /* We ignore recovery events */
+ return;
+}
- dlm_key = crc32_le(0, name->name, name->len);
+void user_dlm_set_locking_protocol(void)
+{
+ ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
+}
- snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
+struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
+{
+ int rc;
+ struct ocfs2_cluster_connection *conn;
- dlm = dlm_register_domain(domain, dlm_key);
- if (IS_ERR(dlm))
- mlog_errno(PTR_ERR(dlm));
+ rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
+ &user_dlm_lproto,
+ user_dlm_recovery_handler_noop,
+ NULL, &conn);
+ if (rc)
+ mlog_errno(rc);
- kfree(domain);
- return dlm;
+ return rc ? ERR_PTR(rc) : conn;
}
-void user_dlm_unregister_context(struct dlm_ctxt *dlm)
+void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
{
- dlm_unregister_domain(dlm);
+ ocfs2_cluster_disconnect(conn, 0);
}
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index c400e93bbf7..3b42d79531d 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -33,7 +33,7 @@
#include <linux/workqueue.h>
/* user_lock_res->l_flags flags. */
-#define USER_LOCK_ATTACHED (0x00000001) /* have we initialized
+#define USER_LOCK_ATTACHED (0x00000001) /* we have initialized
* the lvb */
#define USER_LOCK_BUSY (0x00000002) /* we are currently in
* dlm_lock */
@@ -57,7 +57,7 @@ struct user_lock_res {
int l_level;
unsigned int l_ro_holders;
unsigned int l_ex_holders;
- struct dlm_lockstatus l_lksb;
+ struct ocfs2_dlm_lksb l_lksb;
int l_requested;
int l_blocking;
@@ -80,14 +80,15 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres,
void user_dlm_write_lvb(struct inode *inode,
const char *val,
unsigned int len);
-void user_dlm_read_lvb(struct inode *inode,
- char *val,
- unsigned int len);
-struct dlm_ctxt *user_dlm_register_context(struct qstr *name);
-void user_dlm_unregister_context(struct dlm_ctxt *dlm);
+ssize_t user_dlm_read_lvb(struct inode *inode,
+ char *val,
+ unsigned int len);
+struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
+void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
+void user_dlm_set_locking_protocol(void);
struct dlmfs_inode_private {
- struct dlm_ctxt *ip_dlm;
+ struct ocfs2_cluster_connection *ip_conn;
struct user_lock_res ip_lockres; /* unused for directories. */
struct inode *ip_parent;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 8801e41afe8..52cfe99ae05 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -27,35 +27,33 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/mm.h>
-#include <linux/smp_lock.h>
-#include <linux/crc32.h>
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
-
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
-#include <cluster/tcp.h>
-
-#include <dlm/dlmapi.h>
+#include <linux/time.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_DLM_GLUE
#include <cluster/masklog.h>
#include "ocfs2.h"
+#include "ocfs2_lockingver.h"
#include "alloc.h"
#include "dcache.h"
#include "dlmglue.h"
#include "extent_map.h"
+#include "file.h"
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
+#include "stackglue.h"
#include "slot_map.h"
#include "super.h"
#include "uptodate.h"
-#include "vote.h"
+#include "quota.h"
+#include "refcounttree.h"
#include "buffer_head_io.h"
@@ -65,10 +63,15 @@ struct ocfs2_mask_waiter {
struct completion mw_complete;
unsigned long mw_mask;
unsigned long mw_goal;
+#ifdef CONFIG_OCFS2_FS_STATS
+ ktime_t mw_lock_start;
+#endif
};
static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
/*
* Return value from ->downconvert_worker functions.
@@ -90,6 +93,9 @@ struct ocfs2_unblock_ctl {
enum ocfs2_unblock_action unblock_action;
};
+/* Lockdep class keys */
+struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
+
static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
int new_level);
static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres);
@@ -103,6 +109,41 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
+
+static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level);
+static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
+ int blocking);
+
+#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
+
+/* This aids in debugging situations where a bad LVB might be involved. */
+static void ocfs2_dump_meta_lvb_info(u64 level,
+ const char *function,
+ unsigned int line,
+ struct ocfs2_lock_res *lockres)
+{
+ struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+
+ mlog(level, "LVB information for %s (called from %s:%u):\n",
+ lockres->l_name, function, line);
+ mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
+ lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
+ be32_to_cpu(lvb->lvb_igeneration));
+ mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
+ (unsigned long long)be64_to_cpu(lvb->lvb_isize),
+ be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
+ be16_to_cpu(lvb->lvb_imode));
+ mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
+ "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
+ (long long)be64_to_cpu(lvb->lvb_iatime_packed),
+ (long long)be64_to_cpu(lvb->lvb_ictime_packed),
+ (long long)be64_to_cpu(lvb->lvb_imtime_packed),
+ be32_to_cpu(lvb->lvb_iattr));
+}
+
+
/*
* OCFS2 Lock Resource Operations
*
@@ -124,10 +165,10 @@ struct ocfs2_lock_res_ops {
struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
/*
- * Optionally called in the downconvert (or "vote") thread
- * after a successful downconvert. The lockres will not be
- * referenced after this callback is called, so it is safe to
- * free memory, etc.
+ * Optionally called in the downconvert thread after a
+ * successful downconvert. The lockres will not be referenced
+ * after this callback is called, so it is safe to free
+ * memory, etc.
*
* The exact semantics of when this is called are controlled
* by ->downconvert_worker()
@@ -196,17 +237,12 @@ static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
.flags = 0,
};
-static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
+static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
.get_osb = ocfs2_get_inode_osb,
.check_downconvert = ocfs2_check_meta_downconvert,
.set_lvb = ocfs2_set_meta_lvb,
- .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
-};
-
-static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
- .get_osb = ocfs2_get_inode_osb,
.downconvert_worker = ocfs2_data_convert_worker,
- .flags = 0,
+ .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
};
static struct ocfs2_lock_res_ops ocfs2_super_lops = {
@@ -217,6 +253,14 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
+ .flags = 0,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
+ .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.get_osb = ocfs2_get_dentry_osb,
.post_unlock = ocfs2_dentry_post_unlock,
@@ -224,11 +268,38 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
+ .get_osb = ocfs2_get_inode_osb,
+ .flags = 0,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
+ .get_osb = ocfs2_get_file_osb,
+ .flags = 0,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
+ .set_lvb = ocfs2_set_qinfo_lvb,
+ .get_osb = ocfs2_get_qinfo_osb,
+ .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
+};
+
+static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
+ .check_downconvert = ocfs2_check_refcount_downconvert,
+ .downconvert_worker = ocfs2_refcount_convert_worker,
+ .flags = 0,
+};
+
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
- lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
- lockres->l_type == OCFS2_LOCK_TYPE_RW;
+ lockres->l_type == OCFS2_LOCK_TYPE_RW ||
+ lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
+}
+
+static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
+{
+ return container_of(lksb, struct ocfs2_lock_res, l_lksb);
}
static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
@@ -245,6 +316,19 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res
return (struct ocfs2_dentry_lock *)lockres->l_priv;
}
+static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
+{
+ BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
+
+ return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
+}
+
+static inline struct ocfs2_refcount_tree *
+ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res)
+{
+ return container_of(res, struct ocfs2_refcount_tree, rf_lockres);
+}
+
static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
{
if (lockres->l_ops->get_osb)
@@ -256,12 +340,19 @@ static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *l
static int ocfs2_lock_create(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres,
int level,
- int dlm_flags);
+ u32 dlm_flags);
static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
int wanted);
-static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres,
- int level);
+static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres,
+ int level, unsigned long caller_ip);
+static inline void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres,
+ int level)
+{
+ __ocfs2_cluster_unlock(osb, lockres, level, _RET_IP_);
+}
+
static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
@@ -270,17 +361,34 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
int convert);
-#define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \
- mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \
- "resource %s: %s\n", dlm_errname(_stat), _func, \
- _lockres->l_name, dlm_errmsg(_stat)); \
+#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \
+ if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY) \
+ mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
+ _err, _func, _lockres->l_name); \
+ else \
+ mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n", \
+ _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name, \
+ (unsigned int)ocfs2_get_dentry_lock_ino(_lockres)); \
} while (0)
-static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres);
-static int ocfs2_meta_lock_update(struct inode *inode,
+static int ocfs2_downconvert_thread(void *arg);
+static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
+static int ocfs2_inode_lock_update(struct inode *inode,
struct buffer_head **bh);
static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
static inline int ocfs2_highest_compat_lock_level(int level);
+static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level);
+static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres,
+ int new_level,
+ int lvb,
+ unsigned int generation);
+static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
+static int ocfs2_cancel_convert(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
+
static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
u64 blkno,
@@ -289,8 +397,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
{
int len;
- mlog_entry_void();
-
BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
@@ -300,8 +406,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
mlog(0, "built lock resource with name: %s\n", name);
-
- mlog_exit_void();
}
static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
@@ -324,6 +428,71 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
spin_unlock(&ocfs2_dlm_tracking_lock);
}
+#ifdef CONFIG_OCFS2_FS_STATS
+static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
+{
+ res->l_lock_refresh = 0;
+ memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats));
+ memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats));
+}
+
+static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
+ struct ocfs2_mask_waiter *mw, int ret)
+{
+ u32 usec;
+ ktime_t kt;
+ struct ocfs2_lock_stats *stats;
+
+ if (level == LKM_PRMODE)
+ stats = &res->l_lock_prmode;
+ else if (level == LKM_EXMODE)
+ stats = &res->l_lock_exmode;
+ else
+ return;
+
+ kt = ktime_sub(ktime_get(), mw->mw_lock_start);
+ usec = ktime_to_us(kt);
+
+ stats->ls_gets++;
+ stats->ls_total += ktime_to_ns(kt);
+ /* overflow */
+ if (unlikely(stats->ls_gets == 0)) {
+ stats->ls_gets++;
+ stats->ls_total = ktime_to_ns(kt);
+ }
+
+ if (stats->ls_max < usec)
+ stats->ls_max = usec;
+
+ if (ret)
+ stats->ls_fail++;
+}
+
+static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
+{
+ lockres->l_lock_refresh++;
+}
+
+static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
+{
+ mw->mw_lock_start = ktime_get();
+}
+#else
+static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
+{
+}
+static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res,
+ int level, struct ocfs2_mask_waiter *mw, int ret)
+{
+}
+static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
+{
+}
+static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
+{
+}
+#endif
+
static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
struct ocfs2_lock_res *res,
enum ocfs2_lock_type type,
@@ -334,15 +503,24 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
res->l_ops = ops;
res->l_priv = priv;
- res->l_level = LKM_IVMODE;
- res->l_requested = LKM_IVMODE;
- res->l_blocking = LKM_IVMODE;
+ res->l_level = DLM_LOCK_IV;
+ res->l_requested = DLM_LOCK_IV;
+ res->l_blocking = DLM_LOCK_IV;
res->l_action = OCFS2_AST_INVALID;
res->l_unlock_action = OCFS2_UNLOCK_INVALID;
res->l_flags = OCFS2_LOCK_INITIALIZED;
ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
+
+ ocfs2_init_lock_stats(res);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (type != OCFS2_LOCK_TYPE_OPEN)
+ lockdep_init_map(&res->l_lockdep_map, ocfs2_lock_type_strings[type],
+ &lockdep_keys[type], 0);
+ else
+ res->l_lockdep_map.key = NULL;
+#endif
}
void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
@@ -367,10 +545,10 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
ops = &ocfs2_inode_rw_lops;
break;
case OCFS2_LOCK_TYPE_META:
- ops = &ocfs2_inode_meta_lops;
+ ops = &ocfs2_inode_inode_lops;
break;
- case OCFS2_LOCK_TYPE_DATA:
- ops = &ocfs2_inode_data_lops;
+ case OCFS2_LOCK_TYPE_OPEN:
+ ops = &ocfs2_inode_open_lops;
break;
default:
mlog_bug_on_msg(1, "type: %d\n", type);
@@ -390,6 +568,20 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
return OCFS2_SB(inode->i_sb);
}
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
+{
+ struct ocfs2_mem_dqinfo *info = lockres->l_priv;
+
+ return OCFS2_SB(info->dqi_gi.dqi_sb);
+}
+
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
+{
+ struct ocfs2_file_private *fp = lockres->l_priv;
+
+ return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
+}
+
static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
{
__be64 inode_blkno_be;
@@ -470,10 +662,65 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_rename_lops, osb);
}
-void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
+static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
+ struct ocfs2_super *osb)
{
- mlog_entry_void();
+ /* nfs_sync lockres doesn't come from a slab so we call init
+ * once on it manually. */
+ ocfs2_lock_res_init_once(res);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
+ ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
+ &ocfs2_nfs_sync_lops, osb);
+}
+static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
+ struct ocfs2_super *osb)
+{
+ ocfs2_lock_res_init_once(res);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
+ ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+ &ocfs2_orphan_scan_lops, osb);
+}
+
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_file_private *fp)
+{
+ struct inode *inode = fp->fp_file->f_mapping->host;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ ocfs2_lock_res_init_once(lockres);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
+ inode->i_generation, lockres->l_name);
+ ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
+ OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
+ fp);
+ lockres->l_flags |= OCFS2_LOCK_NOCACHE;
+}
+
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_mem_dqinfo *info)
+{
+ ocfs2_lock_res_init_once(lockres);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
+ 0, lockres->l_name);
+ ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
+ OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
+ info);
+}
+
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_super *osb, u64 ref_blkno,
+ unsigned int generation)
+{
+ ocfs2_lock_res_init_once(lockres);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno,
+ generation, lockres->l_name);
+ ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT,
+ &ocfs2_refcount_block_lops, osb);
+}
+
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
+{
if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
return;
@@ -499,50 +746,42 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
memset(&res->l_lksb, 0, sizeof(res->l_lksb));
res->l_flags = 0UL;
- mlog_exit_void();
}
static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
int level)
{
- mlog_entry_void();
-
BUG_ON(!lockres);
switch(level) {
- case LKM_EXMODE:
+ case DLM_LOCK_EX:
lockres->l_ex_holders++;
break;
- case LKM_PRMODE:
+ case DLM_LOCK_PR:
lockres->l_ro_holders++;
break;
default:
BUG();
}
-
- mlog_exit_void();
}
static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
int level)
{
- mlog_entry_void();
-
BUG_ON(!lockres);
switch(level) {
- case LKM_EXMODE:
+ case DLM_LOCK_EX:
BUG_ON(!lockres->l_ex_holders);
lockres->l_ex_holders--;
break;
- case LKM_PRMODE:
+ case DLM_LOCK_PR:
BUG_ON(!lockres->l_ro_holders);
lockres->l_ro_holders--;
break;
default:
BUG();
}
- mlog_exit_void();
}
/* WARNING: This function lives in a world where the only three lock
@@ -550,27 +789,25 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
* lock types are added. */
static inline int ocfs2_highest_compat_lock_level(int level)
{
- int new_level = LKM_EXMODE;
+ int new_level = DLM_LOCK_EX;
- if (level == LKM_EXMODE)
- new_level = LKM_NLMODE;
- else if (level == LKM_PRMODE)
- new_level = LKM_PRMODE;
+ if (level == DLM_LOCK_EX)
+ new_level = DLM_LOCK_NL;
+ else if (level == DLM_LOCK_PR)
+ new_level = DLM_LOCK_PR;
return new_level;
}
static void lockres_set_flags(struct ocfs2_lock_res *lockres,
unsigned long newflags)
{
- struct list_head *pos, *tmp;
- struct ocfs2_mask_waiter *mw;
+ struct ocfs2_mask_waiter *mw, *tmp;
assert_spin_locked(&lockres->l_lock);
lockres->l_flags = newflags;
- list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
- mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
+ list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) {
if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
continue;
@@ -591,28 +828,22 @@ static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
{
- mlog_entry_void();
-
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
- BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+ BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
lockres->l_level = lockres->l_requested;
if (lockres->l_level <=
ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
- lockres->l_blocking = LKM_NLMODE;
+ lockres->l_blocking = DLM_LOCK_NL;
lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
}
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-
- mlog_exit_void();
}
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
{
- mlog_entry_void();
-
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
@@ -620,24 +851,28 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
* information is already up to data. Convert from NL to
* *anything* however should mark ourselves as needing an
* update */
- if (lockres->l_level == LKM_NLMODE &&
+ if (lockres->l_level == DLM_LOCK_NL &&
lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
lockres->l_level = lockres->l_requested;
- lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
- mlog_exit_void();
+ /*
+ * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
+ * the OCFS2_LOCK_BUSY flag to prevent the dc thread from
+ * downconverting the lock before the upconvert has fully completed.
+ */
+ lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+
+ lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
}
static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
{
- mlog_entry_void();
-
- BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY));
+ BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
- if (lockres->l_requested > LKM_NLMODE &&
+ if (lockres->l_requested > DLM_LOCK_NL &&
!(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
@@ -645,20 +880,15 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
lockres->l_level = lockres->l_requested;
lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-
- mlog_exit_void();
}
static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
int level)
{
int needs_downconvert = 0;
- mlog_entry_void();
assert_spin_locked(&lockres->l_lock);
- lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
-
if (level > lockres->l_blocking) {
/* only schedule a downconvert if we haven't already scheduled
* one that goes low enough to satisfy the level we're
@@ -671,23 +901,142 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
lockres->l_blocking = level;
}
- mlog_exit(needs_downconvert);
+ mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
+ lockres->l_name, level, lockres->l_level, lockres->l_blocking,
+ needs_downconvert);
+
+ if (needs_downconvert)
+ lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+ mlog(0, "needs_downconvert = %d\n", needs_downconvert);
return needs_downconvert;
}
-static void ocfs2_blocking_ast(void *opaque, int level)
+/*
+ * OCFS2_LOCK_PENDING and l_pending_gen.
+ *
+ * Why does OCFS2_LOCK_PENDING exist? To close a race between setting
+ * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock()
+ * for more details on the race.
+ *
+ * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces
+ * a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock()
+ * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear
+ * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns,
+ * the caller is going to try to clear PENDING again. If nothing else is
+ * happening, __lockres_clear_pending() sees PENDING is unset and does
+ * nothing.
+ *
+ * But what if another path (eg downconvert thread) has just started a
+ * new locking action? The other path has re-set PENDING. Our path
+ * cannot clear PENDING, because that will re-open the original race
+ * window.
+ *
+ * [Example]
+ *
+ * ocfs2_meta_lock()
+ * ocfs2_cluster_lock()
+ * set BUSY
+ * set PENDING
+ * drop l_lock
+ * ocfs2_dlm_lock()
+ * ocfs2_locking_ast() ocfs2_downconvert_thread()
+ * clear PENDING ocfs2_unblock_lock()
+ * take_l_lock
+ * !BUSY
+ * ocfs2_prepare_downconvert()
+ * set BUSY
+ * set PENDING
+ * drop l_lock
+ * take l_lock
+ * clear PENDING
+ * drop l_lock
+ * <window>
+ * ocfs2_dlm_lock()
+ *
+ * So as you can see, we now have a window where l_lock is not held,
+ * PENDING is not set, and ocfs2_dlm_lock() has not been called.
+ *
+ * The core problem is that ocfs2_cluster_lock() has cleared the PENDING
+ * set by ocfs2_prepare_downconvert(). That wasn't nice.
+ *
+ * To solve this we introduce l_pending_gen. A call to
+ * lockres_clear_pending() will only do so when it is passed a generation
+ * number that matches the lockres. lockres_set_pending() will return the
+ * current generation number. When ocfs2_cluster_lock() goes to clear
+ * PENDING, it passes the generation it got from set_pending(). In our
+ * example above, the generation numbers will *not* match. Thus,
+ * ocfs2_cluster_lock() will not clear the PENDING set by
+ * ocfs2_prepare_downconvert().
+ */
+
+/* Unlocked version for ocfs2_locking_ast() */
+static void __lockres_clear_pending(struct ocfs2_lock_res *lockres,
+ unsigned int generation,
+ struct ocfs2_super *osb)
+{
+ assert_spin_locked(&lockres->l_lock);
+
+ /*
+ * The ast and locking functions can race us here. The winner
+ * will clear pending, the loser will not.
+ */
+ if (!(lockres->l_flags & OCFS2_LOCK_PENDING) ||
+ (lockres->l_pending_gen != generation))
+ return;
+
+ lockres_clear_flags(lockres, OCFS2_LOCK_PENDING);
+ lockres->l_pending_gen++;
+
+ /*
+ * The downconvert thread may have skipped us because we
+ * were PENDING. Wake it up.
+ */
+ if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+ ocfs2_wake_downconvert_thread(osb);
+}
+
+/* Locked version for callers of ocfs2_dlm_lock() */
+static void lockres_clear_pending(struct ocfs2_lock_res *lockres,
+ unsigned int generation,
+ struct ocfs2_super *osb)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ __lockres_clear_pending(lockres, generation, osb);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
{
- struct ocfs2_lock_res *lockres = opaque;
+ assert_spin_locked(&lockres->l_lock);
+ BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+
+ lockres_or_flags(lockres, OCFS2_LOCK_PENDING);
+
+ return lockres->l_pending_gen;
+}
+
+static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
+{
+ struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
int needs_downconvert;
unsigned long flags;
- BUG_ON(level <= LKM_NLMODE);
+ BUG_ON(level <= DLM_LOCK_NL);
- mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
- lockres->l_name, level, lockres->l_level,
+ mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
+ "type %s\n", lockres->l_name, level, lockres->l_level,
ocfs2_lock_type_string(lockres->l_type));
+ /*
+ * We can skip the bast for locks which don't enable caching -
+ * they'll be dropped at the earliest possible time anyway.
+ */
+ if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
+ return;
+
spin_lock_irqsave(&lockres->l_lock, flags);
needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
if (needs_downconvert)
@@ -696,24 +1045,36 @@ static void ocfs2_blocking_ast(void *opaque, int level)
wake_up(&lockres->l_event);
- ocfs2_kick_vote_thread(osb);
+ ocfs2_wake_downconvert_thread(osb);
}
-static void ocfs2_locking_ast(void *opaque)
+static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
{
- struct ocfs2_lock_res *lockres = opaque;
- struct dlm_lockstatus *lksb = &lockres->l_lksb;
+ struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
+ struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
unsigned long flags;
+ int status;
spin_lock_irqsave(&lockres->l_lock, flags);
- if (lksb->status != DLM_NORMAL) {
- mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
- lockres->l_name, lksb->status);
+ status = ocfs2_dlm_lock_status(&lockres->l_lksb);
+
+ if (status == -EAGAIN) {
+ lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+ goto out;
+ }
+
+ if (status) {
+ mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n",
+ lockres->l_name, status);
spin_unlock_irqrestore(&lockres->l_lock, flags);
return;
}
+ mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
+ "level %d => %d\n", lockres->l_name, lockres->l_action,
+ lockres->l_unlock_action, lockres->l_level, lockres->l_requested);
+
switch(lockres->l_action) {
case OCFS2_AST_ATTACH:
ocfs2_generic_handle_attach_action(lockres);
@@ -726,29 +1087,118 @@ static void ocfs2_locking_ast(void *opaque)
ocfs2_generic_handle_downconvert_action(lockres);
break;
default:
- mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
- "lockres flags = 0x%lx, unlock action: %u\n",
+ mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
+ "flags 0x%lx, unlock: %u\n",
lockres->l_name, lockres->l_action, lockres->l_flags,
lockres->l_unlock_action);
BUG();
}
-
+out:
/* set it to something invalid so if we get called again we
* can catch it. */
lockres->l_action = OCFS2_AST_INVALID;
+ /* Did we try to cancel this lock? Clear that state */
+ if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT)
+ lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+
+ /*
+ * We may have beaten the locking functions here. We certainly
+ * know that dlm_lock() has been called :-)
+ * Because we can't have two lock calls in flight at once, we
+ * can use lockres->l_pending_gen.
+ */
+ __lockres_clear_pending(lockres, lockres->l_pending_gen, osb);
+
wake_up(&lockres->l_event);
spin_unlock_irqrestore(&lockres->l_lock, flags);
}
+static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
+{
+ struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
+ unsigned long flags;
+
+ mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
+ lockres->l_name, lockres->l_unlock_action);
+
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ if (error) {
+ mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
+ "unlock_action %d\n", error, lockres->l_name,
+ lockres->l_unlock_action);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ return;
+ }
+
+ switch(lockres->l_unlock_action) {
+ case OCFS2_UNLOCK_CANCEL_CONVERT:
+ mlog(0, "Cancel convert success for %s\n", lockres->l_name);
+ lockres->l_action = OCFS2_AST_INVALID;
+ /* Downconvert thread may have requeued this lock, we
+ * need to wake it. */
+ if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+ ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
+ break;
+ case OCFS2_UNLOCK_DROP_LOCK:
+ lockres->l_level = DLM_LOCK_IV;
+ break;
+ default:
+ BUG();
+ }
+
+ lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+ lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+ wake_up(&lockres->l_event);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+/*
+ * This is the filesystem locking protocol. It provides the lock handling
+ * hooks for the underlying DLM. It has a maximum version number.
+ * The version number allows interoperability with systems running at
+ * the same major number and an equal or smaller minor number.
+ *
+ * Whenever the filesystem does new things with locks (adds or removes a
+ * lock, orders them differently, does different things underneath a lock),
+ * the version must be changed. The protocol is negotiated when joining
+ * the dlm domain. A node may join the domain if its major version is
+ * identical to all other nodes and its minor version is greater than
+ * or equal to all other nodes. When its minor version is greater than
+ * the other nodes, it will run at the minor version specified by the
+ * other nodes.
+ *
+ * If a locking change is made that will not be compatible with older
+ * versions, the major number must be increased and the minor version set
+ * to zero. If a change merely adds a behavior that can be disabled when
+ * speaking to older versions, the minor version must be increased. If a
+ * change adds a fully backwards compatible change (eg, LVB changes that
+ * are just ignored by older versions), the version does not need to be
+ * updated.
+ */
+static struct ocfs2_locking_protocol lproto = {
+ .lp_max_version = {
+ .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+ .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+ },
+ .lp_lock_ast = ocfs2_locking_ast,
+ .lp_blocking_ast = ocfs2_blocking_ast,
+ .lp_unlock_ast = ocfs2_unlock_ast,
+};
+
+void ocfs2_set_locking_protocol(void)
+{
+ ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
+}
+
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
int convert)
{
unsigned long flags;
- mlog_entry_void();
spin_lock_irqsave(&lockres->l_lock, flags);
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+ lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
if (convert)
lockres->l_action = OCFS2_AST_INVALID;
else
@@ -756,7 +1206,6 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
spin_unlock_irqrestore(&lockres->l_lock, flags);
wake_up(&lockres->l_event);
- mlog_exit_void();
}
/* Note: If we detect another process working on the lock (i.e.,
@@ -766,15 +1215,13 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
static int ocfs2_lock_create(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres,
int level,
- int dlm_flags)
+ u32 dlm_flags)
{
int ret = 0;
- enum dlm_status status;
unsigned long flags;
+ unsigned int gen;
- mlog_entry_void();
-
- mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
+ mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
dlm_flags);
spin_lock_irqsave(&lockres->l_lock, flags);
@@ -787,27 +1234,24 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
lockres->l_action = OCFS2_AST_ATTACH;
lockres->l_requested = level;
lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+ gen = lockres_set_pending(lockres);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- status = dlmlock(osb->dlm,
- level,
- &lockres->l_lksb,
- dlm_flags,
- lockres->l_name,
- OCFS2_LOCK_ID_MAX_LEN - 1,
- ocfs2_locking_ast,
- lockres,
- ocfs2_blocking_ast);
- if (status != DLM_NORMAL) {
- ocfs2_log_dlm_error("dlmlock", status, lockres);
- ret = -EINVAL;
+ ret = ocfs2_dlm_lock(osb->cconn,
+ level,
+ &lockres->l_lksb,
+ dlm_flags,
+ lockres->l_name,
+ OCFS2_LOCK_ID_MAX_LEN - 1);
+ lockres_clear_pending(lockres, gen, osb);
+ if (ret) {
+ ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
ocfs2_recover_from_dlm_error(lockres, 1);
}
- mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
+ mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
bail:
- mlog_exit(ret);
return ret;
}
@@ -853,13 +1297,14 @@ static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
{
INIT_LIST_HEAD(&mw->mw_item);
init_completion(&mw->mw_complete);
+ ocfs2_init_start_time(mw);
}
static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
{
wait_for_completion(&mw->mw_complete);
/* Re-arm the completion in case we want to wait on it again */
- INIT_COMPLETION(mw->mw_complete);
+ reinit_completion(&mw->mw_complete);
return mw->mw_status;
}
@@ -899,35 +1344,51 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
}
-static int ocfs2_cluster_lock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres,
- int level,
- int lkm_flags,
- int arg_flags)
+static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
+ struct ocfs2_lock_res *lockres)
+{
+ int ret;
+
+ ret = wait_for_completion_interruptible(&mw->mw_complete);
+ if (ret)
+ lockres_remove_mask_waiter(lockres, mw);
+ else
+ ret = mw->mw_status;
+ /* Re-arm the completion in case we want to wait on it again */
+ reinit_completion(&mw->mw_complete);
+ return ret;
+}
+
+static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres,
+ int level,
+ u32 lkm_flags,
+ int arg_flags,
+ int l_subclass,
+ unsigned long caller_ip)
{
struct ocfs2_mask_waiter mw;
- enum dlm_status status;
int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
unsigned long flags;
-
- mlog_entry_void();
+ unsigned int gen;
+ int noqueue_attempted = 0;
ocfs2_init_mask_waiter(&mw);
if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
- lkm_flags |= LKM_VALBLK;
+ lkm_flags |= DLM_LKF_VALBLK;
again:
wait = 0;
+ spin_lock_irqsave(&lockres->l_lock, flags);
+
if (catch_signals && signal_pending(current)) {
ret = -ERESTARTSYS;
- goto out;
+ goto unlock;
}
- spin_lock_irqsave(&lockres->l_lock, flags);
-
mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
"Cluster lock called on freeing lockres %s! flags "
"0x%lx\n", lockres->l_name, lockres->l_flags);
@@ -944,16 +1405,23 @@ again:
goto unlock;
}
- if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
- /* lock has not been created yet. */
- spin_unlock_irqrestore(&lockres->l_lock, flags);
-
- ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- goto again;
+ if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) {
+ /*
+ * We've upconverted. If the lock now has a level we can
+ * work with, we take it. If, however, the lock is not at the
+ * required level, we go thru the full cycle. One way this could
+ * happen is if a process requesting an upconvert to PR is
+ * closely followed by another requesting upconvert to an EX.
+ * If the process requesting EX lands here, we want it to
+ * continue attempting to upconvert and let the process
+ * requesting PR take the lock.
+ * If multiple processes request upconvert to PR, the first one
+ * here will take the lock. The others will have to go thru the
+ * OCFS2_LOCK_BLOCKED check to ensure that there is no pending
+ * downconvert request.
+ */
+ if (level <= lockres->l_level)
+ goto update_holders;
}
if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
@@ -966,45 +1434,55 @@ again:
}
if (level > lockres->l_level) {
+ if (noqueue_attempted > 0) {
+ ret = -EAGAIN;
+ goto unlock;
+ }
+ if (lkm_flags & DLM_LKF_NOQUEUE)
+ noqueue_attempted = 1;
+
if (lockres->l_action != OCFS2_AST_INVALID)
mlog(ML_ERROR, "lockres %s has action %u pending\n",
lockres->l_name, lockres->l_action);
- lockres->l_action = OCFS2_AST_CONVERT;
+ if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+ lockres->l_action = OCFS2_AST_ATTACH;
+ lkm_flags &= ~DLM_LKF_CONVERT;
+ } else {
+ lockres->l_action = OCFS2_AST_CONVERT;
+ lkm_flags |= DLM_LKF_CONVERT;
+ }
+
lockres->l_requested = level;
lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+ gen = lockres_set_pending(lockres);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- BUG_ON(level == LKM_IVMODE);
- BUG_ON(level == LKM_NLMODE);
+ BUG_ON(level == DLM_LOCK_IV);
+ BUG_ON(level == DLM_LOCK_NL);
- mlog(0, "lock %s, convert from %d to level = %d\n",
+ mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
lockres->l_name, lockres->l_level, level);
/* call dlm_lock to upgrade lock now */
- status = dlmlock(osb->dlm,
- level,
- &lockres->l_lksb,
- lkm_flags|LKM_CONVERT,
- lockres->l_name,
- OCFS2_LOCK_ID_MAX_LEN - 1,
- ocfs2_locking_ast,
- lockres,
- ocfs2_blocking_ast);
- if (status != DLM_NORMAL) {
- if ((lkm_flags & LKM_NOQUEUE) &&
- (status == DLM_NOTQUEUED))
- ret = -EAGAIN;
- else {
- ocfs2_log_dlm_error("dlmlock", status,
- lockres);
- ret = -EINVAL;
+ ret = ocfs2_dlm_lock(osb->cconn,
+ level,
+ &lockres->l_lksb,
+ lkm_flags,
+ lockres->l_name,
+ OCFS2_LOCK_ID_MAX_LEN - 1);
+ lockres_clear_pending(lockres, gen, osb);
+ if (ret) {
+ if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
+ (ret != -EAGAIN)) {
+ ocfs2_log_dlm_error("ocfs2_dlm_lock",
+ ret, lockres);
}
ocfs2_recover_from_dlm_error(lockres, 1);
goto out;
}
- mlog(0, "lock %s, successfull return from dlmlock\n",
+ mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
lockres->l_name);
/* At this point we've gone inside the dlm and need to
@@ -1015,11 +1493,14 @@ again:
goto again;
}
+update_holders:
/* Ok, if we get here then we're good to go. */
ocfs2_inc_holders(lockres, level);
ret = 0;
unlock:
+ lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+
spin_unlock_irqrestore(&lockres->l_lock, flags);
out:
/*
@@ -1044,33 +1525,59 @@ out:
goto again;
mlog_errno(ret);
}
+ ocfs2_update_lock_stats(lockres, level, &mw, ret);
- mlog_exit(ret);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (!ret && lockres->l_lockdep_map.key != NULL) {
+ if (level == DLM_LOCK_PR)
+ rwsem_acquire_read(&lockres->l_lockdep_map, l_subclass,
+ !!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
+ caller_ip);
+ else
+ rwsem_acquire(&lockres->l_lockdep_map, l_subclass,
+ !!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
+ caller_ip);
+ }
+#endif
return ret;
}
-static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres,
- int level)
+static inline int ocfs2_cluster_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres,
+ int level,
+ u32 lkm_flags,
+ int arg_flags)
+{
+ return __ocfs2_cluster_lock(osb, lockres, level, lkm_flags, arg_flags,
+ 0, _RET_IP_);
+}
+
+
+static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres,
+ int level,
+ unsigned long caller_ip)
{
unsigned long flags;
- mlog_entry_void();
spin_lock_irqsave(&lockres->l_lock, flags);
ocfs2_dec_holders(lockres, level);
- ocfs2_vote_on_unlock(osb, lockres);
+ ocfs2_downconvert_on_unlock(osb, lockres);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- mlog_exit_void();
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (lockres->l_lockdep_map.key != NULL)
+ rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
+#endif
}
-int ocfs2_create_new_lock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres,
- int ex,
- int local)
+static int ocfs2_create_new_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres,
+ int ex,
+ int local)
{
- int level = ex ? LKM_EXMODE : LKM_PRMODE;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
unsigned long flags;
- int lkm_flags = local ? LKM_LOCAL : 0;
+ u32 lkm_flags = local ? DLM_LKF_LOCAL : 0;
spin_lock_irqsave(&lockres->l_lock, flags);
BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
@@ -1094,8 +1601,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
BUG_ON(!inode);
BUG_ON(!ocfs2_inode_is_new(inode));
- mlog_entry_void();
-
mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
/* NOTE: That we don't increment any of the holder counts, nor
@@ -1113,23 +1618,22 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
}
/*
- * We don't want to use LKM_LOCAL on a meta data lock as they
+ * We don't want to use DLM_LKF_LOCAL on a meta data lock as they
* don't use a generation in their lock names.
*/
- ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0);
+ ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
if (ret) {
mlog_errno(ret);
goto bail;
}
- ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
+ ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
if (ret) {
mlog_errno(ret);
goto bail;
}
bail:
- mlog_exit(ret);
return ret;
}
@@ -1137,118 +1641,357 @@ int ocfs2_rw_lock(struct inode *inode, int write)
{
int status, level;
struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
BUG_ON(!inode);
- mlog_entry_void();
-
mlog(0, "inode %llu take %s RW lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
+ if (ocfs2_mount_local(osb))
+ return 0;
+
lockres = &OCFS2_I(inode)->ip_rw_lockres;
- level = write ? LKM_EXMODE : LKM_PRMODE;
+ level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
0);
if (status < 0)
mlog_errno(status);
- mlog_exit(status);
return status;
}
void ocfs2_rw_unlock(struct inode *inode, int write)
{
- int level = write ? LKM_EXMODE : LKM_PRMODE;
+ int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
-
- mlog_entry_void();
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog(0, "inode %llu drop %s RW lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+ if (!ocfs2_mount_local(osb))
+ ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+}
+
+/*
+ * ocfs2_open_lock always get PR mode lock.
+ */
+int ocfs2_open_lock(struct inode *inode)
+{
+ int status = 0;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ BUG_ON(!inode);
+
+ mlog(0, "inode %llu take PRMODE open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
+ goto out;
+
+ lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+ status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+ DLM_LOCK_PR, 0, 0);
+ if (status < 0)
+ mlog_errno(status);
- mlog_exit_void();
+out:
+ return status;
}
-int ocfs2_data_lock_full(struct inode *inode,
- int write,
- int arg_flags)
+int ocfs2_try_open_lock(struct inode *inode, int write)
{
int status = 0, level;
struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
BUG_ON(!inode);
- mlog_entry_void();
-
- mlog(0, "inode %llu take %s DATA lock\n",
+ mlog(0, "inode %llu try to take %s open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
- /* We'll allow faking a readonly data lock for
- * rodevices. */
- if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
- if (write) {
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (write)
status = -EROFS;
- mlog_errno(status);
- }
goto out;
}
- lockres = &OCFS2_I(inode)->ip_data_lockres;
+ if (ocfs2_mount_local(osb))
+ goto out;
- level = write ? LKM_EXMODE : LKM_PRMODE;
+ lockres = &OCFS2_I(inode)->ip_open_lockres;
- status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
- 0, arg_flags);
- if (status < 0 && status != -EAGAIN)
- mlog_errno(status);
+ level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+ /*
+ * The file system may already holding a PRMODE/EXMODE open lock.
+ * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
+ * other nodes and the -EAGAIN will indicate to the caller that
+ * this inode is still in use.
+ */
+ status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+ level, DLM_LKF_NOQUEUE, 0);
out:
- mlog_exit(status);
return status;
}
-/* see ocfs2_meta_lock_with_page() */
-int ocfs2_data_lock_with_page(struct inode *inode,
- int write,
- struct page *page)
+/*
+ * ocfs2_open_unlock unlock PR and EX mode open locks.
+ */
+void ocfs2_open_unlock(struct inode *inode)
+{
+ struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ mlog(0, "inode %llu drop open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ if (ocfs2_mount_local(osb))
+ goto out;
+
+ if(lockres->l_ro_holders)
+ ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+ DLM_LOCK_PR);
+ if(lockres->l_ex_holders)
+ ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+ DLM_LOCK_EX);
+
+out:
+ return;
+}
+
+static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
+ int level)
{
int ret;
+ struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
+ unsigned long flags;
+ struct ocfs2_mask_waiter mw;
- ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
- if (ret == -EAGAIN) {
- unlock_page(page);
- if (ocfs2_data_lock(inode, write) == 0)
- ocfs2_data_unlock(inode, write);
- ret = AOP_TRUNCATED_PAGE;
+ ocfs2_init_mask_waiter(&mw);
+
+retry_cancel:
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+ ret = ocfs2_prepare_cancel_convert(osb, lockres);
+ if (ret) {
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ ret = ocfs2_cancel_convert(osb, lockres);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ goto retry_cancel;
+ }
+ lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+ ocfs2_wait_for_mask(&mw);
+ goto retry_cancel;
}
+ ret = -ERESTARTSYS;
+ /*
+ * We may still have gotten the lock, in which case there's no
+ * point to restarting the syscall.
+ */
+ if (lockres->l_level == level)
+ ret = 0;
+
+ mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
+ lockres->l_flags, lockres->l_level, lockres->l_action);
+
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+out:
return ret;
}
-static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres)
+/*
+ * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
+ * flock() calls. The locking approach this requires is sufficiently
+ * different from all other cluster lock types that we implement a
+ * separate path to the "low-level" dlm calls. In particular:
+ *
+ * - No optimization of lock levels is done - we take at exactly
+ * what's been requested.
+ *
+ * - No lock caching is employed. We immediately downconvert to
+ * no-lock at unlock time. This also means flock locks never go on
+ * the blocking list).
+ *
+ * - Since userspace can trivially deadlock itself with flock, we make
+ * sure to allow cancellation of a misbehaving applications flock()
+ * request.
+ *
+ * - Access to any flock lockres doesn't require concurrency, so we
+ * can simplify the code by requiring the caller to guarantee
+ * serialization of dlmglue flock calls.
+ */
+int ocfs2_file_lock(struct file *file, int ex, int trylock)
{
- int kick = 0;
+ int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
+ unsigned long flags;
+ struct ocfs2_file_private *fp = file->private_data;
+ struct ocfs2_lock_res *lockres = &fp->fp_flock;
+ struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+ struct ocfs2_mask_waiter mw;
+
+ ocfs2_init_mask_waiter(&mw);
+
+ if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
+ (lockres->l_level > DLM_LOCK_NL)) {
+ mlog(ML_ERROR,
+ "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
+ "level: %u\n", lockres->l_name, lockres->l_flags,
+ lockres->l_level);
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+ lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+ /*
+ * Get the lock at NLMODE to start - that way we
+ * can cancel the upconvert request if need be.
+ */
+ ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_wait_for_mask(&mw);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ }
- mlog_entry_void();
+ lockres->l_action = OCFS2_AST_CONVERT;
+ lkm_flags |= DLM_LKF_CONVERT;
+ lockres->l_requested = level;
+ lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+
+ lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+ ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
+ lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
+ if (ret) {
+ if (!trylock || (ret != -EAGAIN)) {
+ ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
+ ret = -EINVAL;
+ }
+
+ ocfs2_recover_from_dlm_error(lockres, 1);
+ lockres_remove_mask_waiter(lockres, &mw);
+ goto out;
+ }
+
+ ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
+ if (ret == -ERESTARTSYS) {
+ /*
+ * Userspace can cause deadlock itself with
+ * flock(). Current behavior locally is to allow the
+ * deadlock, but abort the system call if a signal is
+ * received. We follow this example, otherwise a
+ * poorly written program could sit in kernel until
+ * reboot.
+ *
+ * Handling this is a bit more complicated for Ocfs2
+ * though. We can't exit this function with an
+ * outstanding lock request, so a cancel convert is
+ * required. We intentionally overwrite 'ret' - if the
+ * cancel fails and the lock was granted, it's easier
+ * to just bubble success back up to the user.
+ */
+ ret = ocfs2_flock_handle_signal(lockres, level);
+ } else if (!ret && (level > lockres->l_level)) {
+ /* Trylock failed asynchronously */
+ BUG_ON(!trylock);
+ ret = -EAGAIN;
+ }
+
+out:
+
+ mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
+ lockres->l_name, ex, trylock, ret);
+ return ret;
+}
+
+void ocfs2_file_unlock(struct file *file)
+{
+ int ret;
+ unsigned int gen;
+ unsigned long flags;
+ struct ocfs2_file_private *fp = file->private_data;
+ struct ocfs2_lock_res *lockres = &fp->fp_flock;
+ struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+ struct ocfs2_mask_waiter mw;
+
+ ocfs2_init_mask_waiter(&mw);
+
+ if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
+ return;
+
+ if (lockres->l_level == DLM_LOCK_NL)
+ return;
+
+ mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
+ lockres->l_name, lockres->l_flags, lockres->l_level,
+ lockres->l_action);
+
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ /*
+ * Fake a blocking ast for the downconvert code.
+ */
+ lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+ lockres->l_blocking = DLM_LOCK_EX;
+
+ gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
+ lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+ ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
+ if (ret) {
+ mlog_errno(ret);
+ return;
+ }
+
+ ret = ocfs2_wait_for_mask(&mw);
+ if (ret)
+ mlog_errno(ret);
+}
+
+static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres)
+{
+ int kick = 0;
/* If we know that another node is waiting on our lock, kick
- * the vote thread * pre-emptively when we reach a release
+ * the downconvert thread * pre-emptively when we reach a release
* condition. */
if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
switch(lockres->l_blocking) {
- case LKM_EXMODE:
+ case DLM_LOCK_EX:
if (!lockres->l_ex_holders && !lockres->l_ro_holders)
kick = 1;
break;
- case LKM_PRMODE:
+ case DLM_LOCK_PR:
if (!lockres->l_ex_holders)
kick = 1;
break;
@@ -1258,27 +2001,7 @@ static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
}
if (kick)
- ocfs2_kick_vote_thread(osb);
-
- mlog_exit_void();
-}
-
-void ocfs2_data_unlock(struct inode *inode,
- int write)
-{
- int level = write ? LKM_EXMODE : LKM_PRMODE;
- struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
-
- mlog_entry_void();
-
- mlog(0, "inode %llu drop %s DATA lock\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- write ? "EXMODE" : "PRMODE");
-
- if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
- ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
-
- mlog_exit_void();
+ ocfs2_wake_downconvert_thread(osb);
}
#define OCFS2_SEC_BITS 34
@@ -1300,16 +2023,14 @@ static u64 ocfs2_pack_timespec(struct timespec *spec)
/* Call this with the lockres locked. I am reasonably sure we don't
* need ip_lock in this function as anyone who would be changing those
- * values is supposed to be blocked in ocfs2_meta_lock right now. */
+ * values is supposed to be blocked in ocfs2_inode_lock right now. */
static void __ocfs2_stuff_meta_lvb(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+ struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- mlog_entry_void();
-
- lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
/*
* Invalidate the LVB of a deleted inode - this way other
@@ -1324,8 +2045,8 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_version = OCFS2_LVB_VERSION;
lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
- lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
- lvb->lvb_igid = cpu_to_be32(inode->i_gid);
+ lvb->lvb_iuid = cpu_to_be32(i_uid_read(inode));
+ lvb->lvb_igid = cpu_to_be32(i_gid_read(inode));
lvb->lvb_imode = cpu_to_be16(inode->i_mode);
lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
lvb->lvb_iatime_packed =
@@ -1335,12 +2056,11 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_imtime_packed =
cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
+ lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
out:
mlog_meta_lvb(0, lockres);
-
- mlog_exit_void();
}
static void ocfs2_unpack_timespec(struct timespec *spec,
@@ -1353,14 +2073,12 @@ static void ocfs2_unpack_timespec(struct timespec *spec,
static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
+ struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- mlog_entry_void();
-
mlog_meta_lvb(0, lockres);
- lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
/* We're safe here without the lockres lock... */
spin_lock(&oi->ip_lock);
@@ -1368,19 +2086,19 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
+ oi->ip_dyn_features = be16_to_cpu(lvb->lvb_idynfeatures);
ocfs2_set_inode_flags(inode);
/* fast-symlinks are a special case */
if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
inode->i_blocks = 0;
else
- inode->i_blocks =
- ocfs2_align_bytes_to_sectors(i_size_read(inode));
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
- inode->i_gid = be32_to_cpu(lvb->lvb_igid);
+ i_uid_write(inode, be32_to_cpu(lvb->lvb_iuid));
+ i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
- inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
+ set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
ocfs2_unpack_timespec(&inode->i_atime,
be64_to_cpu(lvb->lvb_iatime_packed));
ocfs2_unpack_timespec(&inode->i_mtime,
@@ -1388,16 +2106,15 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
ocfs2_unpack_timespec(&inode->i_ctime,
be64_to_cpu(lvb->lvb_ictime_packed));
spin_unlock(&oi->ip_lock);
-
- mlog_exit_void();
}
static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
struct ocfs2_lock_res *lockres)
{
- struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+ struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
- if (lvb->lvb_version == OCFS2_LVB_VERSION
+ if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)
+ && lvb->lvb_version == OCFS2_LVB_VERSION
&& be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
return 1;
return 0;
@@ -1415,8 +2132,6 @@ static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
unsigned long flags;
int status = 0;
- mlog_entry_void();
-
refresh_check:
spin_lock_irqsave(&lockres->l_lock, flags);
if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
@@ -1437,7 +2152,7 @@ refresh_check:
status = 1;
bail:
- mlog_exit(status);
+ mlog(0, "status %d\n", status);
return status;
}
@@ -1447,7 +2162,6 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
int status)
{
unsigned long flags;
- mlog_entry_void();
spin_lock_irqsave(&lockres->l_lock, flags);
lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
@@ -1456,20 +2170,20 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
spin_unlock_irqrestore(&lockres->l_lock, flags);
wake_up(&lockres->l_event);
-
- mlog_exit_void();
}
/* may or may not return a bh if it went to disk. */
-static int ocfs2_meta_lock_update(struct inode *inode,
+static int ocfs2_inode_lock_update(struct inode *inode,
struct buffer_head **bh)
{
int status = 0;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_lock_res *lockres;
+ struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_dinode *fe;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry_void();
+ if (ocfs2_mount_local(osb))
+ goto bail;
spin_lock(&oi->ip_lock);
if (oi->ip_flags & OCFS2_INODE_DELETED) {
@@ -1482,17 +2196,13 @@ static int ocfs2_meta_lock_update(struct inode *inode,
}
spin_unlock(&oi->ip_lock);
- lockres = &oi->ip_meta_lockres;
-
if (!ocfs2_should_refresh_lock_res(lockres))
goto bail;
/* This will discard any caching information we might have had
* for the inode metadata. */
- ocfs2_metadata_cache_purge(inode);
+ ocfs2_metadata_cache_purge(INODE_CACHE(inode));
- /* will do nothing for inode types that don't use the extent
- * map (directories, bitmap files, etc) */
ocfs2_extent_map_trunc(inode, 0);
if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
@@ -1502,8 +2212,7 @@ static int ocfs2_meta_lock_update(struct inode *inode,
} else {
/* Boo, we have to go to disk. */
/* read bh, cast, ocfs2_refresh_inode */
- status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
- bh, OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_inode_block(inode, bh);
if (status < 0) {
mlog_errno(status);
goto bail_refresh;
@@ -1511,18 +2220,14 @@ static int ocfs2_meta_lock_update(struct inode *inode,
fe = (struct ocfs2_dinode *) (*bh)->b_data;
/* This is a good chance to make sure we're not
- * locking an invalid object.
+ * locking an invalid object. ocfs2_read_inode_block()
+ * already checked that the inode block is sane.
*
* We bug on a stale inode here because we checked
* above whether it was wiped from disk. The wiping
* node provides a guarantee that we receive that
* message and can mark the inode before dropping any
* locks associated with it. */
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- status = -EIO;
- goto bail_refresh;
- }
mlog_bug_on_msg(inode->i_generation !=
le32_to_cpu(fe->i_generation),
"Invalid dinode %llu disk generation: %u "
@@ -1538,13 +2243,13 @@ static int ocfs2_meta_lock_update(struct inode *inode,
le32_to_cpu(fe->i_flags));
ocfs2_refresh_inode(inode, fe);
+ ocfs2_track_lock_refresh(lockres);
}
status = 0;
bail_refresh:
ocfs2_complete_lock_res_refresh(lockres, status);
bail:
- mlog_exit(status);
return status;
}
@@ -1563,11 +2268,7 @@ static int ocfs2_assign_bh(struct inode *inode,
return 0;
}
- status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno,
- ret_bh,
- OCFS2_BH_CACHED,
- inode);
+ status = ocfs2_read_inode_block(inode, ret_bh);
if (status < 0)
mlog_errno(status);
@@ -1578,21 +2279,20 @@ static int ocfs2_assign_bh(struct inode *inode,
* returns < 0 error if the callback will never be called, otherwise
* the result of the lock will be communicated via the callback.
*/
-int ocfs2_meta_lock_full(struct inode *inode,
- struct ocfs2_journal_handle *handle,
- struct buffer_head **ret_bh,
- int ex,
- int arg_flags)
-{
- int status, level, dlm_flags, acquired;
- struct ocfs2_lock_res *lockres;
+int ocfs2_inode_lock_full_nested(struct inode *inode,
+ struct buffer_head **ret_bh,
+ int ex,
+ int arg_flags,
+ int subclass)
+{
+ int status, level, acquired;
+ u32 dlm_flags;
+ struct ocfs2_lock_res *lockres = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *local_bh = NULL;
BUG_ON(!inode);
- mlog_entry_void();
-
mlog(0, "inode %llu, take %s META lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ex ? "EXMODE" : "PRMODE");
@@ -1604,23 +2304,25 @@ int ocfs2_meta_lock_full(struct inode *inode,
if (ocfs2_is_hard_readonly(osb)) {
if (ex)
status = -EROFS;
- goto bail;
+ goto getbh;
}
+ if (ocfs2_mount_local(osb))
+ goto local;
+
if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
- wait_event(osb->recovery_event,
- ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+ ocfs2_wait_for_recovery(osb);
- acquired = 0;
- lockres = &OCFS2_I(inode)->ip_meta_lockres;
- level = ex ? LKM_EXMODE : LKM_PRMODE;
+ lockres = &OCFS2_I(inode)->ip_inode_lockres;
+ level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
dlm_flags = 0;
if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
- dlm_flags |= LKM_NOQUEUE;
+ dlm_flags |= DLM_LKF_NOQUEUE;
- status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
+ status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
+ arg_flags, subclass, _RET_IP_);
if (status < 0) {
- if (status != -EAGAIN && status != -EIOCBRETRY)
+ if (status != -EAGAIN)
mlog_errno(status);
goto bail;
}
@@ -1633,9 +2335,9 @@ int ocfs2_meta_lock_full(struct inode *inode,
* committed to owning this lock so we don't allow signals to
* abort the operation. */
if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
- wait_event(osb->recovery_event,
- ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+ ocfs2_wait_for_recovery(osb);
+local:
/*
* We only see this flag if we're being called from
* ocfs2_read_locked_inode(). It means we're locking an inode
@@ -1644,22 +2346,23 @@ int ocfs2_meta_lock_full(struct inode *inode,
*/
if (inode->i_state & I_NEW) {
status = 0;
- ocfs2_complete_lock_res_refresh(lockres, 0);
+ if (lockres)
+ ocfs2_complete_lock_res_refresh(lockres, 0);
goto bail;
}
/* This is fun. The caller may want a bh back, or it may
- * not. ocfs2_meta_lock_update definitely wants one in, but
+ * not. ocfs2_inode_lock_update definitely wants one in, but
* may or may not read one, depending on what's in the
* LVB. The result of all of this is that we've *only* gone to
* disk if we have to, so the complexity is worthwhile. */
- status = ocfs2_meta_lock_update(inode, &local_bh);
+ status = ocfs2_inode_lock_update(inode, &local_bh);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
goto bail;
}
-
+getbh:
if (ret_bh) {
status = ocfs2_assign_bh(inode, ret_bh, local_bh);
if (status < 0) {
@@ -1668,12 +2371,6 @@ int ocfs2_meta_lock_full(struct inode *inode,
}
}
- if (handle) {
- status = ocfs2_handle_add_lock(handle, inode);
- if (status < 0)
- mlog_errno(status);
- }
-
bail:
if (status < 0) {
if (ret_bh && (*ret_bh)) {
@@ -1681,30 +2378,30 @@ bail:
*ret_bh = NULL;
}
if (acquired)
- ocfs2_meta_unlock(inode, ex);
+ ocfs2_inode_unlock(inode, ex);
}
if (local_bh)
brelse(local_bh);
- mlog_exit(status);
return status;
}
/*
- * This is working around a lock inversion between tasks acquiring DLM locks
- * while holding a page lock and the vote thread which blocks dlm lock acquiry
- * while acquiring page locks.
+ * This is working around a lock inversion between tasks acquiring DLM
+ * locks while holding a page lock and the downconvert thread which
+ * blocks dlm lock acquiry while acquiring page locks.
*
* ** These _with_page variantes are only intended to be called from aop
* methods that hold page locks and return a very specific *positive* error
* code that aop methods pass up to the VFS -- test for errors with != 0. **
*
- * The DLM is called such that it returns -EAGAIN if it would have blocked
- * waiting for the vote thread. In that case we unlock our page so the vote
- * thread can make progress. Once we've done this we have to return
- * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
- * into the VFS who will then immediately retry the aop call.
+ * The DLM is called such that it returns -EAGAIN if it would have
+ * blocked waiting for the downconvert thread. In that case we unlock
+ * our page so the downconvert thread can make progress. Once we've
+ * done this we have to return AOP_TRUNCATED_PAGE so the aop method
+ * that called us can bubble that back up into the VFS who will then
+ * immediately retry the aop call.
*
* We do a blocking lock and immediate unlock before returning, though, so that
* the lock has a great chance of being cached on this node by the time the VFS
@@ -1712,58 +2409,130 @@ bail:
* ping locks back and forth, but that's a risk we're willing to take to avoid
* the lock inversion simply.
*/
-int ocfs2_meta_lock_with_page(struct inode *inode,
- struct ocfs2_journal_handle *handle,
+int ocfs2_inode_lock_with_page(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
struct page *page)
{
int ret;
- ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
- OCFS2_LOCK_NONBLOCK);
+ ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
if (ret == -EAGAIN) {
unlock_page(page);
- if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
- ocfs2_meta_unlock(inode, ex);
+ if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
+ ocfs2_inode_unlock(inode, ex);
ret = AOP_TRUNCATED_PAGE;
}
return ret;
}
-void ocfs2_meta_unlock(struct inode *inode,
- int ex)
+int ocfs2_inode_lock_atime(struct inode *inode,
+ struct vfsmount *vfsmnt,
+ int *level)
{
- int level = ex ? LKM_EXMODE : LKM_PRMODE;
- struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
+ int ret;
+
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /*
+ * If we should update atime, we will get EX lock,
+ * otherwise we just get PR lock.
+ */
+ if (ocfs2_should_update_atime(inode, vfsmnt)) {
+ struct buffer_head *bh = NULL;
+
+ ocfs2_inode_unlock(inode, 0);
+ ret = ocfs2_inode_lock(inode, &bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ *level = 1;
+ if (ocfs2_should_update_atime(inode, vfsmnt))
+ ocfs2_update_inode_atime(inode, bh);
+ if (bh)
+ brelse(bh);
+ } else
+ *level = 0;
- mlog_entry_void();
+ return ret;
+}
+
+void ocfs2_inode_unlock(struct inode *inode,
+ int ex)
+{
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog(0, "inode %llu drop %s META lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ex ? "EXMODE" : "PRMODE");
- if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
+ if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
+ !ocfs2_mount_local(osb))
ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+}
+
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
+{
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_orphan_scan_lvb *lvb;
+ int status = 0;
+
+ if (ocfs2_is_hard_readonly(osb))
+ return -EROFS;
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ lockres = &osb->osb_orphan_scan.os_lockres;
+ status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
+ if (status < 0)
+ return status;
+
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
+ lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
+ *seqno = be32_to_cpu(lvb->lvb_os_seqno);
+ else
+ *seqno = osb->osb_orphan_scan.os_seqno + 1;
+
+ return status;
+}
+
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno)
+{
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_orphan_scan_lvb *lvb;
- mlog_exit_void();
+ if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) {
+ lockres = &osb->osb_orphan_scan.os_lockres;
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
+ lvb->lvb_os_seqno = cpu_to_be32(seqno);
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
+ }
}
int ocfs2_super_lock(struct ocfs2_super *osb,
int ex)
{
- int status;
- int level = ex ? LKM_EXMODE : LKM_PRMODE;
+ int status = 0;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
- struct buffer_head *bh;
- struct ocfs2_slot_info *si = osb->slot_info;
-
- mlog_entry_void();
if (ocfs2_is_hard_readonly(osb))
return -EROFS;
+ if (ocfs2_mount_local(osb))
+ goto bail;
+
status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
if (status < 0) {
mlog_errno(status);
@@ -1775,34 +2544,29 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
* refreshed, so we do it here. Of course, making sense of
* everything is up to the caller :) */
status = ocfs2_should_refresh_lock_res(lockres);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
if (status) {
- bh = si->si_bh;
- status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
- si->si_inode);
- if (status == 0)
- ocfs2_update_slot_info(si);
+ status = ocfs2_refresh_slot_info(osb);
ocfs2_complete_lock_res_refresh(lockres, status);
- if (status < 0)
+ if (status < 0) {
+ ocfs2_cluster_unlock(osb, lockres, level);
mlog_errno(status);
+ }
+ ocfs2_track_lock_refresh(lockres);
}
bail:
- mlog_exit(status);
return status;
}
void ocfs2_super_unlock(struct ocfs2_super *osb,
int ex)
{
- int level = ex ? LKM_EXMODE : LKM_PRMODE;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
- ocfs2_cluster_unlock(osb, lockres, level);
+ if (!ocfs2_mount_local(osb))
+ ocfs2_cluster_unlock(osb, lockres, level);
}
int ocfs2_rename_lock(struct ocfs2_super *osb)
@@ -1813,7 +2577,10 @@ int ocfs2_rename_lock(struct ocfs2_super *osb)
if (ocfs2_is_hard_readonly(osb))
return -EROFS;
- status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
if (status < 0)
mlog_errno(status);
@@ -1824,20 +2591,55 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
{
struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
- ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
+ if (!ocfs2_mount_local(osb))
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
+}
+
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
+{
+ int status;
+ struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+ if (ocfs2_is_hard_readonly(osb))
+ return -EROFS;
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
+ 0, 0);
+ if (status < 0)
+ mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
+
+ return status;
+}
+
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
+{
+ struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+ if (!ocfs2_mount_local(osb))
+ ocfs2_cluster_unlock(osb, lockres,
+ ex ? LKM_EXMODE : LKM_PRMODE);
}
int ocfs2_dentry_lock(struct dentry *dentry, int ex)
{
int ret;
- int level = ex ? LKM_EXMODE : LKM_PRMODE;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
BUG_ON(!dl);
- if (ocfs2_is_hard_readonly(osb))
- return -EROFS;
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (ex)
+ return -EROFS;
+ return 0;
+ }
+
+ if (ocfs2_mount_local(osb))
+ return 0;
ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0);
if (ret < 0)
@@ -1848,11 +2650,12 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
{
- int level = ex ? LKM_EXMODE : LKM_PRMODE;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
+ if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
+ ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
}
/* Reference counting of the dlm debug structure. We want this because
@@ -1974,8 +2777,15 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
return iter;
}
-/* So that debugfs.ocfs2 can determine which format is being used */
-#define OCFS2_DLM_DEBUG_STR_VERSION 1
+/*
+ * Version is used by debugfs.ocfs2 to determine the format being used
+ *
+ * New in version 2
+ * - Lock stats printed
+ * New in version 3
+ * - Max time in lock stats is in usecs (instead of nsecs)
+ */
+#define OCFS2_DLM_DEBUG_STR_VERSION 3
static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
{
int i;
@@ -2012,16 +2822,57 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
lockres->l_blocking);
/* Dump the raw LVB */
- lvb = lockres->l_lksb.lvb;
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
for(i = 0; i < DLM_LVB_LEN; i++)
seq_printf(m, "0x%x\t", lvb[i]);
+#ifdef CONFIG_OCFS2_FS_STATS
+# define lock_num_prmode(_l) ((_l)->l_lock_prmode.ls_gets)
+# define lock_num_exmode(_l) ((_l)->l_lock_exmode.ls_gets)
+# define lock_num_prmode_failed(_l) ((_l)->l_lock_prmode.ls_fail)
+# define lock_num_exmode_failed(_l) ((_l)->l_lock_exmode.ls_fail)
+# define lock_total_prmode(_l) ((_l)->l_lock_prmode.ls_total)
+# define lock_total_exmode(_l) ((_l)->l_lock_exmode.ls_total)
+# define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max)
+# define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max)
+# define lock_refresh(_l) ((_l)->l_lock_refresh)
+#else
+# define lock_num_prmode(_l) (0)
+# define lock_num_exmode(_l) (0)
+# define lock_num_prmode_failed(_l) (0)
+# define lock_num_exmode_failed(_l) (0)
+# define lock_total_prmode(_l) (0ULL)
+# define lock_total_exmode(_l) (0ULL)
+# define lock_max_prmode(_l) (0)
+# define lock_max_exmode(_l) (0)
+# define lock_refresh(_l) (0)
+#endif
+ /* The following seq_print was added in version 2 of this output */
+ seq_printf(m, "%u\t"
+ "%u\t"
+ "%u\t"
+ "%u\t"
+ "%llu\t"
+ "%llu\t"
+ "%u\t"
+ "%u\t"
+ "%u\t",
+ lock_num_prmode(lockres),
+ lock_num_exmode(lockres),
+ lock_num_prmode_failed(lockres),
+ lock_num_exmode_failed(lockres),
+ lock_total_prmode(lockres),
+ lock_total_exmode(lockres),
+ lock_max_prmode(lockres),
+ lock_max_exmode(lockres),
+ lock_refresh(lockres));
+
/* End the line */
seq_printf(m, "\n");
return 0;
}
-static struct seq_operations ocfs2_dlm_seq_ops = {
+static const struct seq_operations ocfs2_dlm_seq_ops = {
.start = ocfs2_dlm_seq_start,
.stop = ocfs2_dlm_seq_stop,
.next = ocfs2_dlm_seq_next,
@@ -2030,7 +2881,7 @@ static struct seq_operations ocfs2_dlm_seq_ops = {
static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
{
- struct seq_file *seq = (struct seq_file *) file->private_data;
+ struct seq_file *seq = file->private_data;
struct ocfs2_dlm_seq_priv *priv = seq->private;
struct ocfs2_lock_res *res = &priv->p_iter_res;
@@ -2064,7 +2915,7 @@ static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
goto out;
}
- seq = (struct seq_file *) file->private_data;
+ seq = file->private_data;
seq->private = priv;
ocfs2_add_lockres_tracking(&priv->p_iter_res,
@@ -2115,11 +2966,13 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
int ocfs2_dlm_init(struct ocfs2_super *osb)
{
- int status;
- u32 dlm_key;
- struct dlm_ctxt *dlm;
+ int status = 0;
+ struct ocfs2_cluster_connection *conn = NULL;
- mlog_entry_void();
+ if (ocfs2_mount_local(osb)) {
+ osb->node_num = 0;
+ goto local;
+ }
status = ocfs2_dlm_init_debug(osb);
if (status < 0) {
@@ -2127,139 +2980,96 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
goto bail;
}
- /* launch vote thread */
- osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
- if (IS_ERR(osb->vote_task)) {
- status = PTR_ERR(osb->vote_task);
- osb->vote_task = NULL;
+ /* launch downconvert thread */
+ osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
+ if (IS_ERR(osb->dc_task)) {
+ status = PTR_ERR(osb->dc_task);
+ osb->dc_task = NULL;
mlog_errno(status);
goto bail;
}
- /* used by the dlm code to make message headers unique, each
- * node in this domain must agree on this. */
- dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
-
/* for now, uuid == domain */
- dlm = dlm_register_domain(osb->uuid_str, dlm_key);
- if (IS_ERR(dlm)) {
- status = PTR_ERR(dlm);
+ status = ocfs2_cluster_connect(osb->osb_cluster_stack,
+ osb->osb_cluster_name,
+ strlen(osb->osb_cluster_name),
+ osb->uuid_str,
+ strlen(osb->uuid_str),
+ &lproto, ocfs2_do_node_down, osb,
+ &conn);
+ if (status) {
mlog_errno(status);
goto bail;
}
+ status = ocfs2_cluster_this_node(conn, &osb->node_num);
+ if (status < 0) {
+ mlog_errno(status);
+ mlog(ML_ERROR,
+ "could not find this host's node number\n");
+ ocfs2_cluster_disconnect(conn, 0);
+ goto bail;
+ }
+
+local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+ ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
- dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
-
- osb->dlm = dlm;
+ osb->cconn = conn;
status = 0;
bail:
if (status < 0) {
ocfs2_dlm_shutdown_debug(osb);
- if (osb->vote_task)
- kthread_stop(osb->vote_task);
+ if (osb->dc_task)
+ kthread_stop(osb->dc_task);
}
- mlog_exit(status);
return status;
}
-void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
+ int hangup_pending)
{
- mlog_entry_void();
-
- dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
-
ocfs2_drop_osb_locks(osb);
- if (osb->vote_task) {
- kthread_stop(osb->vote_task);
- osb->vote_task = NULL;
+ /*
+ * Now that we have dropped all locks and ocfs2_dismount_volume()
+ * has disabled recovery, the DLM won't be talking to us. It's
+ * safe to tear things down before disconnecting the cluster.
+ */
+
+ if (osb->dc_task) {
+ kthread_stop(osb->dc_task);
+ osb->dc_task = NULL;
}
ocfs2_lock_res_free(&osb->osb_super_lockres);
ocfs2_lock_res_free(&osb->osb_rename_lockres);
+ ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
+ ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
- dlm_unregister_domain(osb->dlm);
- osb->dlm = NULL;
+ ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
+ osb->cconn = NULL;
ocfs2_dlm_shutdown_debug(osb);
-
- mlog_exit_void();
-}
-
-static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
-{
- struct ocfs2_lock_res *lockres = opaque;
- unsigned long flags;
-
- mlog_entry_void();
-
- mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
- lockres->l_unlock_action);
-
- spin_lock_irqsave(&lockres->l_lock, flags);
- /* We tried to cancel a convert request, but it was already
- * granted. All we want to do here is clear our unlock
- * state. The wake_up call done at the bottom is redundant
- * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
- * hurt anything anyway */
- if (status == DLM_CANCELGRANT &&
- lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
- mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
-
- /* We don't clear the busy flag in this case as it
- * should have been cleared by the ast which the dlm
- * has called. */
- goto complete_unlock;
- }
-
- if (status != DLM_NORMAL) {
- mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
- "unlock_action %d\n", status, lockres->l_name,
- lockres->l_unlock_action);
- spin_unlock_irqrestore(&lockres->l_lock, flags);
- return;
- }
-
- switch(lockres->l_unlock_action) {
- case OCFS2_UNLOCK_CANCEL_CONVERT:
- mlog(0, "Cancel convert success for %s\n", lockres->l_name);
- lockres->l_action = OCFS2_AST_INVALID;
- break;
- case OCFS2_UNLOCK_DROP_LOCK:
- lockres->l_level = LKM_IVMODE;
- break;
- default:
- BUG();
- }
-
- lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-complete_unlock:
- lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
- spin_unlock_irqrestore(&lockres->l_lock, flags);
-
- wake_up(&lockres->l_event);
-
- mlog_exit_void();
}
static int ocfs2_drop_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
{
- enum dlm_status status;
+ int ret;
unsigned long flags;
- int lkm_flags = 0;
+ u32 lkm_flags = 0;
/* We didn't get anywhere near actually using this lockres. */
if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
goto out;
if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
- lkm_flags |= LKM_VALBLK;
+ lkm_flags |= DLM_LKF_VALBLK;
spin_lock_irqsave(&lockres->l_lock, flags);
@@ -2285,7 +3095,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
- lockres->l_level == LKM_EXMODE &&
+ lockres->l_level == DLM_LOCK_EX &&
!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
lockres->l_ops->set_lvb(lockres);
}
@@ -2314,39 +3124,75 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
mlog(0, "lock %s\n", lockres->l_name);
- status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags,
- ocfs2_unlock_ast, lockres);
- if (status != DLM_NORMAL) {
- ocfs2_log_dlm_error("dlmunlock", status, lockres);
+ ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
+ if (ret) {
+ ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
- dlm_print_one_lock(lockres->l_lksb.lockid);
+ ocfs2_dlm_dump_lksb(&lockres->l_lksb);
BUG();
}
- mlog(0, "lock %s, successfull return from dlmunlock\n",
+ mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
lockres->l_name);
ocfs2_wait_on_busy_lock(lockres);
out:
- mlog_exit(0);
return 0;
}
+static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
+
/* Mark the lockres as being dropped. It will no longer be
* queued if blocking, but we still may have to wait on it
- * being dequeued from the vote thread before we can consider
- * it safe to drop.
+ * being dequeued from the downconvert thread before we can consider
+ * it safe to drop.
*
* You can *not* attempt to call cluster_lock on this lockres anymore. */
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres)
{
int status;
struct ocfs2_mask_waiter mw;
- unsigned long flags;
+ unsigned long flags, flags2;
ocfs2_init_mask_waiter(&mw);
spin_lock_irqsave(&lockres->l_lock, flags);
lockres->l_flags |= OCFS2_LOCK_FREEING;
+ if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
+ /*
+ * We know the downconvert is queued but not in progress
+ * because we are the downconvert thread and processing
+ * different lock. So we can just remove the lock from the
+ * queue. This is not only an optimization but also a way
+ * to avoid the following deadlock:
+ * ocfs2_dentry_post_unlock()
+ * ocfs2_dentry_lock_put()
+ * ocfs2_drop_dentry_lock()
+ * iput()
+ * ocfs2_evict_inode()
+ * ocfs2_clear_inode()
+ * ocfs2_mark_lockres_freeing()
+ * ... blocks waiting for OCFS2_LOCK_QUEUED
+ * since we are the downconvert thread which
+ * should clear the flag.
+ */
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ spin_lock_irqsave(&osb->dc_task_lock, flags2);
+ list_del_init(&lockres->l_blocked_list);
+ osb->blocked_lock_count--;
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags2);
+ /*
+ * Warn if we recurse into another post_unlock call. Strictly
+ * speaking it isn't a problem but we need to be careful if
+ * that happens (stack overflow, deadlocks, ...) so warn if
+ * ocfs2 grows a path for which this can happen.
+ */
+ WARN_ON_ONCE(lockres->l_ops->post_unlock);
+ /* Since the lock is freeing we don't do much in the fn below */
+ ocfs2_process_blocked_lock(osb, lockres);
+ return;
+ }
while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -2367,7 +3213,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
{
int ret;
- ocfs2_mark_lockres_freeing(lockres);
+ ocfs2_mark_lockres_freeing(osb, lockres);
ret = ocfs2_drop_lock(osb, lockres);
if (ret)
mlog_errno(ret);
@@ -2377,26 +3223,26 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
{
ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
+ ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
+ ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
}
int ocfs2_drop_inode_locks(struct inode *inode)
{
int status, err;
- mlog_entry_void();
-
/* No need to call ocfs2_mark_lockres_freeing here -
* ocfs2_clear_inode has done it for us. */
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
- &OCFS2_I(inode)->ip_data_lockres);
+ &OCFS2_I(inode)->ip_open_lockres);
if (err < 0)
mlog_errno(err);
status = err;
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
- &OCFS2_I(inode)->ip_meta_lockres);
+ &OCFS2_I(inode)->ip_inode_lockres);
if (err < 0)
mlog_errno(err);
if (err < 0 && !status)
@@ -2409,81 +3255,82 @@ int ocfs2_drop_inode_locks(struct inode *inode)
if (err < 0 && !status)
status = err;
- mlog_exit(status);
return status;
}
-static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
- int new_level)
+static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level)
{
assert_spin_locked(&lockres->l_lock);
- BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+ BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
if (lockres->l_level <= new_level) {
- mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
- lockres->l_level, new_level);
+ mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
+ "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
+ "block %d, pgen %d\n", lockres->l_name, lockres->l_level,
+ new_level, list_empty(&lockres->l_blocked_list),
+ list_empty(&lockres->l_mask_waiters), lockres->l_type,
+ lockres->l_flags, lockres->l_ro_holders,
+ lockres->l_ex_holders, lockres->l_action,
+ lockres->l_unlock_action, lockres->l_requested,
+ lockres->l_blocking, lockres->l_pending_gen);
BUG();
}
- mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
- lockres->l_name, new_level, lockres->l_blocking);
+ mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
+ lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);
lockres->l_action = OCFS2_AST_DOWNCONVERT;
lockres->l_requested = new_level;
lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+ return lockres_set_pending(lockres);
}
static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres,
int new_level,
- int lvb)
+ int lvb,
+ unsigned int generation)
{
- int ret, dlm_flags = LKM_CONVERT;
- enum dlm_status status;
+ int ret;
+ u32 dlm_flags = DLM_LKF_CONVERT;
- mlog_entry_void();
+ mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
+ lockres->l_level, new_level);
if (lvb)
- dlm_flags |= LKM_VALBLK;
-
- status = dlmlock(osb->dlm,
- new_level,
- &lockres->l_lksb,
- dlm_flags,
- lockres->l_name,
- OCFS2_LOCK_ID_MAX_LEN - 1,
- ocfs2_locking_ast,
- lockres,
- ocfs2_blocking_ast);
- if (status != DLM_NORMAL) {
- ocfs2_log_dlm_error("dlmlock", status, lockres);
- ret = -EINVAL;
+ dlm_flags |= DLM_LKF_VALBLK;
+
+ ret = ocfs2_dlm_lock(osb->cconn,
+ new_level,
+ &lockres->l_lksb,
+ dlm_flags,
+ lockres->l_name,
+ OCFS2_LOCK_ID_MAX_LEN - 1);
+ lockres_clear_pending(lockres, generation, osb);
+ if (ret) {
+ ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
ocfs2_recover_from_dlm_error(lockres, 1);
goto bail;
}
ret = 0;
bail:
- mlog_exit(ret);
return ret;
}
-/* returns 1 when the caller should unlock and call dlmunlock */
+/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */
static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
{
assert_spin_locked(&lockres->l_lock);
- mlog_entry_void();
- mlog(0, "lock %s\n", lockres->l_name);
-
if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
/* If we're already trying to cancel a lock conversion
* then just drop the spinlock and allow the caller to
* requeue this lock. */
-
- mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
+ mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
return 0;
}
@@ -2498,6 +3345,8 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
"lock %s, invalid flags: 0x%lx\n",
lockres->l_name, lockres->l_flags);
+ mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
+
return 1;
}
@@ -2505,26 +3354,16 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
{
int ret;
- enum dlm_status status;
- mlog_entry_void();
- mlog(0, "lock %s\n", lockres->l_name);
-
- ret = 0;
- status = dlmunlock(osb->dlm,
- &lockres->l_lksb,
- LKM_CANCEL,
- ocfs2_unlock_ast,
- lockres);
- if (status != DLM_NORMAL) {
- ocfs2_log_dlm_error("dlmunlock", status, lockres);
- ret = -EINVAL;
+ ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
+ DLM_LKF_CANCEL);
+ if (ret) {
+ ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
ocfs2_recover_from_dlm_error(lockres, 0);
}
- mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
+ mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
- mlog_exit(ret);
return ret;
}
@@ -2535,17 +3374,54 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
unsigned long flags;
int blocking;
int new_level;
+ int level;
int ret = 0;
int set_lvb = 0;
-
- mlog_entry_void();
+ unsigned int gen;
spin_lock_irqsave(&lockres->l_lock, flags);
- BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
-
recheck:
+ /*
+ * Is it still blocking? If not, we have no more work to do.
+ */
+ if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
+ BUG_ON(lockres->l_blocking != DLM_LOCK_NL);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ ret = 0;
+ goto leave;
+ }
+
if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+ /* XXX
+ * This is a *big* race. The OCFS2_LOCK_PENDING flag
+ * exists entirely for one reason - another thread has set
+ * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock().
+ *
+ * If we do ocfs2_cancel_convert() before the other thread
+ * calls dlm_lock(), our cancel will do nothing. We will
+ * get no ast, and we will have no way of knowing the
+ * cancel failed. Meanwhile, the other thread will call
+ * into dlm_lock() and wait...forever.
+ *
+ * Why forever? Because another node has asked for the
+ * lock first; that's why we're here in unblock_lock().
+ *
+ * The solution is OCFS2_LOCK_PENDING. When PENDING is
+ * set, we just requeue the unblock. Only when the other
+ * thread has called dlm_lock() and cleared PENDING will
+ * we then cancel their request.
+ *
+ * All callers of dlm_lock() must set OCFS2_DLM_PENDING
+ * at the same time they set OCFS2_DLM_BUSY. They must
+ * clear OCFS2_DLM_PENDING after dlm_lock() returns.
+ */
+ if (lockres->l_flags & OCFS2_LOCK_PENDING) {
+ mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
+ lockres->l_name);
+ goto leave_requeue;
+ }
+
ctl->requeue = 1;
ret = ocfs2_prepare_cancel_convert(osb, lockres);
spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -2557,31 +3433,70 @@ recheck:
goto leave;
}
+ /*
+ * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is
+ * set when the ast is received for an upconvert just before the
+ * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast
+ * on the heels of the ast, we want to delay the downconvert just
+ * enough to allow the up requestor to do its task. Because this
+ * lock is in the blocked queue, the lock will be downconverted
+ * as soon as the requestor is done with the lock.
+ */
+ if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING)
+ goto leave_requeue;
+
+ /*
+ * How can we block and yet be at NL? We were trying to upconvert
+ * from NL and got canceled. The code comes back here, and now
+ * we notice and clear BLOCKING.
+ */
+ if (lockres->l_level == DLM_LOCK_NL) {
+ BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
+ mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
+ lockres->l_blocking = DLM_LOCK_NL;
+ lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ goto leave;
+ }
+
/* if we're blocking an exclusive and we have *any* holders,
* then requeue. */
- if ((lockres->l_blocking == LKM_EXMODE)
- && (lockres->l_ex_holders || lockres->l_ro_holders))
+ if ((lockres->l_blocking == DLM_LOCK_EX)
+ && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+ mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
+ lockres->l_name, lockres->l_ex_holders,
+ lockres->l_ro_holders);
goto leave_requeue;
+ }
/* If it's a PR we're blocking, then only
* requeue if we've got any EX holders */
- if (lockres->l_blocking == LKM_PRMODE &&
- lockres->l_ex_holders)
+ if (lockres->l_blocking == DLM_LOCK_PR &&
+ lockres->l_ex_holders) {
+ mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
+ lockres->l_name, lockres->l_ex_holders);
goto leave_requeue;
+ }
/*
* Can we get a lock in this state if the holder counts are
* zero? The meta data unblock code used to check this.
*/
if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
- && (lockres->l_flags & OCFS2_LOCK_REFRESHING))
+ && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
+ mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
+ lockres->l_name);
goto leave_requeue;
+ }
new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
if (lockres->l_ops->check_downconvert
- && !lockres->l_ops->check_downconvert(lockres, new_level))
+ && !lockres->l_ops->check_downconvert(lockres, new_level)) {
+ mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
+ lockres->l_name);
goto leave_requeue;
+ }
/* If we get here, then we know that there are no more
* incompatible holders (and anyone asking for an incompatible
@@ -2594,17 +3509,24 @@ recheck:
* may sleep, so we save off a copy of what we're blocking as
* it may change while we're not holding the spin lock. */
blocking = lockres->l_blocking;
+ level = lockres->l_level;
spin_unlock_irqrestore(&lockres->l_lock, flags);
ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
- if (ctl->unblock_action == UNBLOCK_STOP_POST)
+ if (ctl->unblock_action == UNBLOCK_STOP_POST) {
+ mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
+ lockres->l_name);
goto leave;
+ }
spin_lock_irqsave(&lockres->l_lock, flags);
- if (blocking != lockres->l_blocking) {
+ if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
/* If this changed underneath us, then we can't drop
* it just yet. */
+ mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
+ "Recheck\n", lockres->l_name, blocking,
+ lockres->l_blocking, level, lockres->l_level);
goto recheck;
}
@@ -2612,7 +3534,7 @@ downconvert:
ctl->requeue = 0;
if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
- if (lockres->l_level == LKM_EXMODE)
+ if (lockres->l_level == DLM_LOCK_EX)
set_lvb = 1;
/*
@@ -2625,18 +3547,20 @@ downconvert:
lockres->l_ops->set_lvb(lockres);
}
- ocfs2_prepare_downconvert(lockres, new_level);
+ gen = ocfs2_prepare_downconvert(lockres, new_level);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
+ ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
+ gen);
+
leave:
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
leave_requeue:
spin_unlock_irqrestore(&lockres->l_lock, flags);
ctl->requeue = 1;
- mlog_exit(0);
return 0;
}
@@ -2645,18 +3569,37 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
{
struct inode *inode;
struct address_space *mapping;
+ struct ocfs2_inode_info *oi;
inode = ocfs2_lock_res_inode(lockres);
mapping = inode->i_mapping;
+ if (S_ISDIR(inode->i_mode)) {
+ oi = OCFS2_I(inode);
+ oi->ip_dir_lock_gen++;
+ mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
+ goto out;
+ }
+
+ if (!S_ISREG(inode->i_mode))
+ goto out;
+
+ /*
+ * We need this before the filemap_fdatawrite() so that it can
+ * transfer the dirty bit from the PTE to the
+ * page. Unfortunately this means that even for EX->PR
+ * downconverts, we'll lose our mappings and have to build
+ * them up again.
+ */
+ unmap_mapping_range(mapping, 0, 0, 0);
+
if (filemap_fdatawrite(mapping)) {
mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
}
sync_mapping_buffers(mapping);
- if (blocking == LKM_EXMODE) {
+ if (blocking == DLM_LOCK_EX) {
truncate_inode_pages(mapping, 0);
- unmap_mapping_range(mapping, 0, 0, 0);
} else {
/* We only need to wait on the I/O if we're not also
* truncating pages because truncate_inode_pages waits
@@ -2666,25 +3609,34 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
filemap_fdatawait(mapping);
}
+out:
return UNBLOCK_CONTINUE;
}
-static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
- int new_level)
+static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci,
+ struct ocfs2_lock_res *lockres,
+ int new_level)
{
- struct inode *inode = ocfs2_lock_res_inode(lockres);
- int checkpointed = ocfs2_inode_fully_checkpointed(inode);
+ int checkpointed = ocfs2_ci_fully_checkpointed(ci);
- BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
- BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed);
+ BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
+ BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
if (checkpointed)
return 1;
- ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb));
+ ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci)));
return 0;
}
+static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level)
+{
+ struct inode *inode = ocfs2_lock_res_inode(lockres);
+
+ return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level);
+}
+
static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
{
struct inode *inode = ocfs2_lock_res_inode(lockres);
@@ -2694,7 +3646,7 @@ static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
/*
* Does the final reference drop on our dentry lock. Right now this
- * happens in the vote thread, but we could choose to simplify the
+ * happens in the downconvert thread, but we could choose to simplify the
* dlmglue API and push these off to the ocfs2_wq in the future.
*/
static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
@@ -2740,7 +3692,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
* valid. The downconvert code will retain a PR for this node,
* so there's no further work to do.
*/
- if (blocking == LKM_PRMODE)
+ if (blocking == DLM_LOCK_PR)
return UNBLOCK_CONTINUE;
/*
@@ -2814,8 +3766,163 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
return UNBLOCK_CONTINUE_POST;
}
-void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres)
+static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level)
+{
+ struct ocfs2_refcount_tree *tree =
+ ocfs2_lock_res_refcount_tree(lockres);
+
+ return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level);
+}
+
+static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
+ int blocking)
+{
+ struct ocfs2_refcount_tree *tree =
+ ocfs2_lock_res_refcount_tree(lockres);
+
+ ocfs2_metadata_cache_purge(&tree->rf_ci);
+
+ return UNBLOCK_CONTINUE;
+}
+
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
+{
+ struct ocfs2_qinfo_lvb *lvb;
+ struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
+ struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+ oinfo->dqi_gi.dqi_type);
+
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
+ lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
+ lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
+ lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
+ lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
+ lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
+ lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
+}
+
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+ struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+ struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+ if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
+ ocfs2_cluster_unlock(osb, lockres, level);
+}
+
+static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
+{
+ struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+ oinfo->dqi_gi.dqi_type);
+ struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+ struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ struct buffer_head *bh = NULL;
+ struct ocfs2_global_disk_dqinfo *gdinfo;
+ int status = 0;
+
+ if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
+ lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
+ info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
+ info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
+ oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
+ oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
+ oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
+ oinfo->dqi_gi.dqi_free_entry =
+ be32_to_cpu(lvb->lvb_free_entry);
+ } else {
+ status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
+ oinfo->dqi_giblk, &bh);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ gdinfo = (struct ocfs2_global_disk_dqinfo *)
+ (bh->b_data + OCFS2_GLOBAL_INFO_OFF);
+ info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
+ info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
+ oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
+ oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
+ oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
+ oinfo->dqi_gi.dqi_free_entry =
+ le32_to_cpu(gdinfo->dqi_free_entry);
+ brelse(bh);
+ ocfs2_track_lock_refresh(lockres);
+ }
+
+bail:
+ return status;
+}
+
+/* Lock quota info, this function expects at least shared lock on the quota file
+ * so that we can safely refresh quota info from disk. */
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+ struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+ struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ int status = 0;
+
+ /* On RO devices, locking really isn't needed... */
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (ex)
+ status = -EROFS;
+ goto bail;
+ }
+ if (ocfs2_mount_local(osb))
+ goto bail;
+
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ if (!ocfs2_should_refresh_lock_res(lockres))
+ goto bail;
+ /* OK, we have the lock but we need to refresh the quota info */
+ status = ocfs2_refresh_qinfo(oinfo);
+ if (status)
+ ocfs2_qinfo_unlock(oinfo, ex);
+ ocfs2_complete_lock_res_refresh(lockres, status);
+bail:
+ return status;
+}
+
+int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex)
+{
+ int status;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
+ struct ocfs2_super *osb = lockres->l_priv;
+
+
+ if (ocfs2_is_hard_readonly(osb))
+ return -EROFS;
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+ if (status < 0)
+ mlog_errno(status);
+
+ return status;
+}
+
+void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
+{
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
+ struct ocfs2_super *osb = lockres->l_priv;
+
+ if (!ocfs2_mount_local(osb))
+ ocfs2_cluster_unlock(osb, lockres, level);
+}
+
+static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres)
{
int status;
struct ocfs2_unblock_ctl ctl = {0, 0,};
@@ -2825,15 +3932,13 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
* considered valid until we remove the OCFS2_LOCK_QUEUED
* flag. */
- mlog_entry_void();
-
BUG_ON(!lockres);
BUG_ON(!lockres->l_ops);
- mlog(0, "lockres %s blocked.\n", lockres->l_name);
+ mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);
/* Detect whether a lock has been marked as going away while
- * the vote thread was processing other things. A lock can
+ * the downconvert thread was processing other things. A lock can
* still be marked with OCFS2_LOCK_FREEING after this check,
* but short circuiting here will still save us some
* performance. */
@@ -2853,21 +3958,19 @@ unqueue:
} else
ocfs2_schedule_blocked_lock(osb, lockres);
- mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
+ mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
ctl.requeue ? "yes" : "no");
spin_unlock_irqrestore(&lockres->l_lock, flags);
if (ctl.unblock_action != UNBLOCK_CONTINUE
&& lockres->l_ops->post_unlock)
lockres->l_ops->post_unlock(osb, lockres);
-
- mlog_exit_void();
}
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
{
- mlog_entry_void();
+ unsigned long flags;
assert_spin_locked(&lockres->l_lock);
@@ -2875,45 +3978,110 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
/* Do not schedule a lock for downconvert when it's on
* the way to destruction - any nodes wanting access
* to the resource will get it soon. */
- mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
+ mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
lockres->l_name, lockres->l_flags);
return;
}
lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
- spin_lock(&osb->vote_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
if (list_empty(&lockres->l_blocked_list)) {
list_add_tail(&lockres->l_blocked_list,
&osb->blocked_lock_list);
osb->blocked_lock_count++;
}
- spin_unlock(&osb->vote_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
+}
+
+static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
+{
+ unsigned long processed;
+ unsigned long flags;
+ struct ocfs2_lock_res *lockres;
+
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
+ /* grab this early so we know to try again if a state change and
+ * wake happens part-way through our work */
+ osb->dc_work_sequence = osb->dc_wake_sequence;
- mlog_exit_void();
+ processed = osb->blocked_lock_count;
+ while (processed) {
+ BUG_ON(list_empty(&osb->blocked_lock_list));
+
+ lockres = list_entry(osb->blocked_lock_list.next,
+ struct ocfs2_lock_res, l_blocked_list);
+ list_del_init(&lockres->l_blocked_list);
+ osb->blocked_lock_count--;
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
+
+ BUG_ON(!processed);
+ processed--;
+
+ ocfs2_process_blocked_lock(osb, lockres);
+
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
+ }
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
}
-/* This aids in debugging situations where a bad LVB might be involved. */
-void ocfs2_dump_meta_lvb_info(u64 level,
- const char *function,
- unsigned int line,
- struct ocfs2_lock_res *lockres)
+static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
{
- struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+ int empty = 0;
+ unsigned long flags;
- mlog(level, "LVB information for %s (called from %s:%u):\n",
- lockres->l_name, function, line);
- mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
- lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
- be32_to_cpu(lvb->lvb_igeneration));
- mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
- (unsigned long long)be64_to_cpu(lvb->lvb_isize),
- be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
- be16_to_cpu(lvb->lvb_imode));
- mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
- "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
- (long long)be64_to_cpu(lvb->lvb_iatime_packed),
- (long long)be64_to_cpu(lvb->lvb_ictime_packed),
- (long long)be64_to_cpu(lvb->lvb_imtime_packed),
- be32_to_cpu(lvb->lvb_iattr));
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
+ if (list_empty(&osb->blocked_lock_list))
+ empty = 1;
+
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
+ return empty;
+}
+
+static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
+{
+ int should_wake = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
+ if (osb->dc_work_sequence != osb->dc_wake_sequence)
+ should_wake = 1;
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
+
+ return should_wake;
+}
+
+static int ocfs2_downconvert_thread(void *arg)
+{
+ int status = 0;
+ struct ocfs2_super *osb = arg;
+
+ /* only quit once we've been asked to stop and there is no more
+ * work available */
+ while (!(kthread_should_stop() &&
+ ocfs2_downconvert_thread_lists_empty(osb))) {
+
+ wait_event_interruptible(osb->dc_event,
+ ocfs2_downconvert_thread_should_wake(osb) ||
+ kthread_should_stop());
+
+ mlog(0, "downconvert_thread: awoken\n");
+
+ ocfs2_downconvert_thread_do_work(osb);
+ }
+
+ osb->dc_task = NULL;
+ return status;
+}
+
+void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
+ /* make sure the voting thread gets a swipe at whatever changes
+ * the caller may have made to the voting state */
+ osb->dc_wake_sequence++;
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
+ wake_up(&osb->dc_event);
}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 4a276938722..d293a22c32c 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -29,12 +29,12 @@
#include "dcache.h"
-#define OCFS2_LVB_VERSION 4
+#define OCFS2_LVB_VERSION 5
struct ocfs2_meta_lvb {
__u8 lvb_version;
__u8 lvb_reserved0;
- __be16 lvb_reserved1;
+ __be16 lvb_idynfeatures;
__be32 lvb_iclusters;
__be32 lvb_iuid;
__be32 lvb_igid;
@@ -49,16 +49,46 @@ struct ocfs2_meta_lvb {
__be32 lvb_reserved2;
};
-/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
+#define OCFS2_QINFO_LVB_VERSION 1
+
+struct ocfs2_qinfo_lvb {
+ __u8 lvb_version;
+ __u8 lvb_reserved[3];
+ __be32 lvb_bgrace;
+ __be32 lvb_igrace;
+ __be32 lvb_syncms;
+ __be32 lvb_blocks;
+ __be32 lvb_free_blk;
+ __be32 lvb_free_entry;
+};
+
+#define OCFS2_ORPHAN_LVB_VERSION 1
+
+struct ocfs2_orphan_scan_lvb {
+ __u8 lvb_version;
+ __u8 lvb_reserved[3];
+ __be32 lvb_os_seqno;
+};
+
+/* ocfs2_inode_lock_full() 'arg_flags' flags */
/* don't wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY (0x01)
/* Instruct the dlm not to queue ourselves on the other node. */
#define OCFS2_META_LOCK_NOQUEUE (0x02)
-/* don't block waiting for the vote thread, instead return -EAGAIN */
+/* don't block waiting for the downconvert thread, instead return -EAGAIN */
#define OCFS2_LOCK_NONBLOCK (0x04)
+/* Locking subclasses of inode cluster lock */
+enum {
+ OI_LS_NORMAL = 0,
+ OI_LS_PARENT,
+ OI_LS_RENAME1,
+ OI_LS_RENAME2,
+ OI_LS_REFLINK_TARGET,
+};
+
int ocfs2_dlm_init(struct ocfs2_super *osb);
-void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
enum ocfs2_lock_type type,
@@ -66,62 +96,78 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
struct inode *inode);
void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
u64 parent, struct inode *inode);
+struct ocfs2_file_private;
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_file_private *fp);
+struct ocfs2_mem_dqinfo;
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_mem_dqinfo *info);
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_super *osb, u64 ref_blkno,
+ unsigned int generation);
void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
int ocfs2_create_new_inode_locks(struct inode *inode);
-int ocfs2_create_new_lock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres, int ex, int local);
int ocfs2_drop_inode_locks(struct inode *inode);
-int ocfs2_data_lock_full(struct inode *inode,
- int write,
- int arg_flags);
-#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
-int ocfs2_data_lock_with_page(struct inode *inode,
- int write,
- struct page *page);
-void ocfs2_data_unlock(struct inode *inode,
- int write);
int ocfs2_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);
-int ocfs2_meta_lock_full(struct inode *inode,
- struct ocfs2_journal_handle *handle,
+int ocfs2_open_lock(struct inode *inode);
+int ocfs2_try_open_lock(struct inode *inode, int write);
+void ocfs2_open_unlock(struct inode *inode);
+int ocfs2_inode_lock_atime(struct inode *inode,
+ struct vfsmount *vfsmnt,
+ int *level);
+int ocfs2_inode_lock_full_nested(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
- int arg_flags);
-int ocfs2_meta_lock_with_page(struct inode *inode,
- struct ocfs2_journal_handle *handle,
+ int arg_flags,
+ int subclass);
+int ocfs2_inode_lock_with_page(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
struct page *page);
+/* Variants without special locking class or flags */
+#define ocfs2_inode_lock_full(i, r, e, f)\
+ ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL)
+#define ocfs2_inode_lock_nested(i, b, e, s)\
+ ocfs2_inode_lock_full_nested(i, b, e, 0, s)
/* 99% of the time we don't want to supply any additional flags --
* those are for very specific cases only. */
-#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
-void ocfs2_meta_unlock(struct inode *inode,
+#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL)
+void ocfs2_inode_unlock(struct inode *inode,
int ex);
int ocfs2_super_lock(struct ocfs2_super *osb,
int ex);
void ocfs2_super_unlock(struct ocfs2_super *osb,
int ex);
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno);
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno);
+
int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
int ocfs2_dentry_lock(struct dentry *dentry, int ex);
void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
+int ocfs2_file_lock(struct file *file, int ex, int trylock);
+void ocfs2_file_unlock(struct file *file);
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+struct ocfs2_refcount_tree;
+int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
+void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
+
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
-/* for the vote thread */
-void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres);
+/* for the downconvert thread */
+void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
-/* aids in debugging and tracking lvbs */
-void ocfs2_dump_meta_lvb_info(u64 level,
- const char *function,
- unsigned int line,
- struct ocfs2_lock_res *lockres);
-#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
-
+/* To set the locking protocol on module initialization */
+void ocfs2_set_locking_protocol(void);
#endif /* DLMGLUE_H */
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
deleted file mode 100644
index f226b220762..00000000000
--- a/fs/ocfs2/endian.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_ENDIAN_H
-#define OCFS2_ENDIAN_H
-
-static inline void le16_add_cpu(__le16 *var, u16 val)
-{
- *var = cpu_to_le16(le16_to_cpu(*var) + val);
-}
-
-static inline void le32_add_cpu(__le32 *var, u32 val)
-{
- *var = cpu_to_le32(le32_to_cpu(*var) + val);
-}
-
-static inline void le32_and_cpu(__le32 *var, u32 val)
-{
- *var = cpu_to_le32(le32_to_cpu(*var) & val);
-}
-
-static inline void be32_add_cpu(__be32 *var, u32 val)
-{
- *var = cpu_to_be32(be32_to_cpu(*var) + val);
-}
-
-#endif /* OCFS2_ENDIAN_H */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index fb91089a60a..29651167190 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -26,11 +26,11 @@
#include <linux/fs.h>
#include <linux/types.h>
-#define MLOG_MASK_PREFIX ML_EXPORT
#include <cluster/masklog.h>
#include "ocfs2.h"
+#include "alloc.h"
#include "dir.h"
#include "dlmglue.h"
#include "dcache.h"
@@ -38,6 +38,8 @@
#include "inode.h"
#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "ocfs2_trace.h"
struct ocfs2_inode_handle
{
@@ -45,42 +47,98 @@ struct ocfs2_inode_handle
u32 ih_generation;
};
-static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
+static struct dentry *ocfs2_get_dentry(struct super_block *sb,
+ struct ocfs2_inode_handle *handle)
{
- struct ocfs2_inode_handle *handle = vobjp;
struct inode *inode;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ u64 blkno = handle->ih_blkno;
+ int status, set;
struct dentry *result;
- mlog_entry("(0x%p, 0x%p)\n", sb, handle);
+ trace_ocfs2_get_dentry_begin(sb, handle, (unsigned long long)blkno);
- if (handle->ih_blkno == 0) {
- mlog_errno(-ESTALE);
- return ERR_PTR(-ESTALE);
+ if (blkno == 0) {
+ result = ERR_PTR(-ESTALE);
+ goto bail;
+ }
+
+ inode = ocfs2_ilookup(sb, blkno);
+ /*
+ * If the inode exists in memory, we only need to check it's
+ * generation number
+ */
+ if (inode)
+ goto check_gen;
+
+ /*
+ * This will synchronize us against ocfs2_delete_inode() on
+ * all nodes
+ */
+ status = ocfs2_nfs_sync_lock(osb, 1);
+ if (status < 0) {
+ mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+ goto check_err;
+ }
+
+ status = ocfs2_test_inode_bit(osb, blkno, &set);
+ trace_ocfs2_get_dentry_test_bit(status, set);
+ if (status < 0) {
+ if (status == -EINVAL) {
+ /*
+ * The blkno NFS gave us doesn't even show up
+ * as an inode, we return -ESTALE to be
+ * nice
+ */
+ status = -ESTALE;
+ } else
+ mlog(ML_ERROR, "test inode bit failed %d\n", status);
+ goto unlock_nfs_sync;
+ }
+
+ /* If the inode allocator bit is clear, this inode must be stale */
+ if (!set) {
+ status = -ESTALE;
+ goto unlock_nfs_sync;
}
- inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0);
+ inode = ocfs2_iget(osb, blkno, 0, 0);
+
+unlock_nfs_sync:
+ ocfs2_nfs_sync_unlock(osb, 1);
+
+check_err:
+ if (status < 0) {
+ if (status == -ESTALE) {
+ trace_ocfs2_get_dentry_stale((unsigned long long)blkno,
+ handle->ih_generation);
+ }
+ result = ERR_PTR(status);
+ goto bail;
+ }
if (IS_ERR(inode)) {
mlog_errno(PTR_ERR(inode));
- return (void *)inode;
+ result = (void *)inode;
+ goto bail;
}
+check_gen:
if (handle->ih_generation != inode->i_generation) {
iput(inode);
- mlog_errno(-ESTALE);
- return ERR_PTR(-ESTALE);
+ trace_ocfs2_get_dentry_generation((unsigned long long)blkno,
+ handle->ih_generation,
+ inode->i_generation);
+ result = ERR_PTR(-ESTALE);
+ goto bail;
}
- result = d_alloc_anon(inode);
-
- if (!result) {
- iput(inode);
- mlog_errno(-ENOMEM);
- return ERR_PTR(-ENOMEM);
- }
- result->d_op = &ocfs2_dentry_ops;
+ result = d_obtain_alias(inode);
+ if (IS_ERR(result))
+ mlog_errno(PTR_ERR(result));
- mlog_exit_ptr(result);
+bail:
+ trace_ocfs2_get_dentry_end(result);
return result;
}
@@ -89,18 +147,12 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
int status;
u64 blkno;
struct dentry *parent;
- struct inode *inode;
struct inode *dir = child->d_inode;
- struct buffer_head *dirent_bh = NULL;
- struct ocfs2_dir_entry *dirent;
- mlog_entry("(0x%p, '%.*s')\n", child,
- child->d_name.len, child->d_name.name);
+ trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno);
- mlog(0, "find parent of directory %llu\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno);
-
- status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+ status = ocfs2_inode_lock(dir, NULL, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -108,77 +160,60 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
goto bail;
}
- status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
- &dirent);
+ status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno);
if (status < 0) {
parent = ERR_PTR(-ENOENT);
goto bail_unlock;
}
- inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
- if (IS_ERR(inode)) {
- mlog(ML_ERROR, "Unable to create inode %llu\n",
- (unsigned long long)blkno);
- parent = ERR_PTR(-EACCES);
- goto bail_unlock;
- }
-
- parent = d_alloc_anon(inode);
- if (!parent) {
- iput(inode);
- parent = ERR_PTR(-ENOMEM);
- }
-
- parent->d_op = &ocfs2_dentry_ops;
+ parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
bail_unlock:
- ocfs2_meta_unlock(dir, 0);
-
- if (dirent_bh)
- brelse(dirent_bh);
+ ocfs2_inode_unlock(dir, 0);
bail:
- mlog_exit_ptr(parent);
+ trace_ocfs2_get_parent_end(parent);
return parent;
}
-static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
- int connectable)
+static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len,
+ struct inode *parent)
{
- struct inode *inode = dentry->d_inode;
int len = *max_len;
int type = 1;
u64 blkno;
u32 generation;
-
- mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
- dentry->d_name.len, dentry->d_name.name,
- fh, len, connectable);
-
- if (len < 3 || (connectable && len < 6)) {
- mlog(ML_ERROR, "fh buffer is too small for encoding\n");
- type = 255;
+ __le32 *fh = (__force __le32 *) fh_in;
+
+#ifdef TRACE_HOOKS_ARE_NOT_BRAINDEAD_IN_YOUR_OPINION
+#error "You go ahead and fix that mess, then. Somehow"
+ trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len,
+ dentry->d_name.name,
+ fh, len, connectable);
+#endif
+
+ if (parent && (len < 6)) {
+ *max_len = 6;
+ type = FILEID_INVALID;
+ goto bail;
+ } else if (len < 3) {
+ *max_len = 3;
+ type = FILEID_INVALID;
goto bail;
}
blkno = OCFS2_I(inode)->ip_blkno;
generation = inode->i_generation;
- mlog(0, "Encoding fh: blkno: %llu, generation: %u\n",
- (unsigned long long)blkno, generation);
+ trace_ocfs2_encode_fh_self((unsigned long long)blkno, generation);
len = 3;
fh[0] = cpu_to_le32((u32)(blkno >> 32));
fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
fh[2] = cpu_to_le32(generation);
- if (connectable && !S_ISDIR(inode->i_mode)) {
- struct inode *parent;
-
- spin_lock(&dentry->d_lock);
-
- parent = dentry->d_parent->d_inode;
+ if (parent) {
blkno = OCFS2_I(parent)->ip_blkno;
generation = parent->i_generation;
@@ -186,69 +221,51 @@ static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
fh[5] = cpu_to_le32(generation);
- spin_unlock(&dentry->d_lock);
-
len = 6;
type = 2;
- mlog(0, "Encoding parent: blkno: %llu, generation: %u\n",
- (unsigned long long)blkno, generation);
+ trace_ocfs2_encode_fh_parent((unsigned long long)blkno,
+ generation);
}
-
+
*max_len = len;
bail:
- mlog_exit(type);
+ trace_ocfs2_encode_fh_type(type);
return type;
}
-static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
- int fh_len, int fileid_type,
- int (*acceptable)(void *context,
- struct dentry *de),
- void *context)
+static struct dentry *ocfs2_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
{
- struct ocfs2_inode_handle handle, parent;
- struct dentry *ret = NULL;
-
- mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
- sb, fh, fh_len, fileid_type, acceptable, context);
-
- if (fh_len < 3 || fileid_type > 2)
- goto bail;
-
- if (fileid_type == 2) {
- if (fh_len < 6)
- goto bail;
+ struct ocfs2_inode_handle handle;
- parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32;
- parent.ih_blkno |= (u64)le32_to_cpu(fh[4]);
- parent.ih_generation = le32_to_cpu(fh[5]);
+ if (fh_len < 3 || fh_type > 2)
+ return NULL;
- mlog(0, "Decoding parent: blkno: %llu, generation: %u\n",
- (unsigned long long)parent.ih_blkno,
- parent.ih_generation);
- }
-
- handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32;
- handle.ih_blkno |= (u64)le32_to_cpu(fh[1]);
- handle.ih_generation = le32_to_cpu(fh[2]);
+ handle.ih_blkno = (u64)le32_to_cpu(fid->raw[0]) << 32;
+ handle.ih_blkno |= (u64)le32_to_cpu(fid->raw[1]);
+ handle.ih_generation = le32_to_cpu(fid->raw[2]);
+ return ocfs2_get_dentry(sb, &handle);
+}
- mlog(0, "Encoding fh: blkno: %llu, generation: %u\n",
- (unsigned long long)handle.ih_blkno, handle.ih_generation);
+static struct dentry *ocfs2_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ struct ocfs2_inode_handle parent;
- ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent,
- acceptable, context);
+ if (fh_type != 2 || fh_len < 6)
+ return NULL;
-bail:
- mlog_exit_ptr(ret);
- return ret;
+ parent.ih_blkno = (u64)le32_to_cpu(fid->raw[3]) << 32;
+ parent.ih_blkno |= (u64)le32_to_cpu(fid->raw[4]);
+ parent.ih_generation = le32_to_cpu(fid->raw[5]);
+ return ocfs2_get_dentry(sb, &parent);
}
-struct export_operations ocfs2_export_ops = {
- .decode_fh = ocfs2_decode_fh,
+const struct export_operations ocfs2_export_ops = {
.encode_fh = ocfs2_encode_fh,
-
+ .fh_to_dentry = ocfs2_fh_to_dentry,
+ .fh_to_parent = ocfs2_fh_to_parent,
.get_parent = ocfs2_get_parent,
- .get_dentry = ocfs2_get_dentry,
};
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
index 5b77ee7866e..41a738678c3 100644
--- a/fs/ocfs2/export.h
+++ b/fs/ocfs2/export.h
@@ -26,6 +26,8 @@
#ifndef OCFS2_EXPORT_H
#define OCFS2_EXPORT_H
-extern struct export_operations ocfs2_export_ops;
+#include <linux/exportfs.h>
+
+extern const struct export_operations ocfs2_export_ops;
#endif /* OCFS2_EXPORT_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index fcd4475d1f8..767370b656c 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -3,8 +3,7 @@
*
* extent_map.c
*
- * In-memory extent map for OCFS2. Man, this code was prettier in
- * the library.
+ * Block/Cluster mapping functions
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
@@ -25,1017 +24,971 @@
#include <linux/fs.h>
#include <linux/init.h>
-#include <linux/types.h>
#include <linux/slab.h>
-#include <linux/rbtree.h>
+#include <linux/types.h>
+#include <linux/fiemap.h>
-#define MLOG_MASK_PREFIX ML_EXTENT_MAP
#include <cluster/masklog.h>
#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "super.h"
+#include "symlink.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
-
/*
- * SUCK SUCK SUCK
- * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
+ * The extent caching implementation is intentionally trivial.
+ *
+ * We only cache a small number of extents stored directly on the
+ * inode, so linear order operations are acceptable. If we ever want
+ * to increase the size of the extent map, then these algorithms must
+ * get smarter.
*/
-struct ocfs2_extent_map_entry {
- struct rb_node e_node;
- int e_tree_depth;
- struct ocfs2_extent_rec e_rec;
-};
-
-struct ocfs2_em_insert_context {
- int need_left;
- int need_right;
- struct ocfs2_extent_map_entry *new_ent;
- struct ocfs2_extent_map_entry *old_ent;
- struct ocfs2_extent_map_entry *left_ent;
- struct ocfs2_extent_map_entry *right_ent;
-};
-
-static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
-
-
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
- u32 cpos, u32 clusters,
- struct rb_node ***ret_p,
- struct rb_node **ret_parent);
-static int ocfs2_extent_map_insert(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- int tree_depth);
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
- struct ocfs2_extent_map_entry *ent);
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
- u32 cpos, u32 clusters,
- struct ocfs2_extent_list *el);
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
- u32 cpos, u32 clusters,
- struct ocfs2_extent_map_entry **ret_ent);
-static int ocfs2_extent_map_try_insert(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- int tree_depth,
- struct ocfs2_em_insert_context *ctxt);
-
-/* returns 1 only if the rec contains all the given clusters -- that is that
- * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
- * clusters) is >= the argument's endpoint */
-static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
- u32 cpos, u32 clusters)
+void ocfs2_extent_map_init(struct inode *inode)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ oi->ip_extent_map.em_num_items = 0;
+ INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
+}
+
+static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+ unsigned int cpos,
+ struct ocfs2_extent_map_item **ret_emi)
{
- if (le32_to_cpu(rec->e_cpos) > cpos)
- return 0;
- if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters))
- return 0;
- return 1;
+ unsigned int range;
+ struct ocfs2_extent_map_item *emi;
+
+ *ret_emi = NULL;
+
+ list_for_each_entry(emi, &em->em_list, ei_list) {
+ range = emi->ei_cpos + emi->ei_clusters;
+
+ if (cpos >= emi->ei_cpos && cpos < range) {
+ list_move(&emi->ei_list, &em->em_list);
+
+ *ret_emi = emi;
+ break;
+ }
+ }
}
+static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
+ unsigned int *phys, unsigned int *len,
+ unsigned int *flags)
+{
+ unsigned int coff;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_extent_map_item *emi;
+
+ spin_lock(&oi->ip_lock);
+
+ __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
+ if (emi) {
+ coff = cpos - emi->ei_cpos;
+ *phys = emi->ei_phys + coff;
+ if (len)
+ *len = emi->ei_clusters - coff;
+ if (flags)
+ *flags = emi->ei_flags;
+ }
+
+ spin_unlock(&oi->ip_lock);
+
+ if (emi == NULL)
+ return -ENOENT;
+
+ return 0;
+}
/*
- * Find an entry in the tree that intersects the region passed in.
- * Note that this will find straddled intervals, it is up to the
- * callers to enforce any boundary conditions.
- *
- * Callers must hold ip_lock. This lookup is not guaranteed to return
- * a tree_depth 0 match, and as such can race inserts if the lock
- * were not held.
- *
- * The rb_node garbage lets insertion share the search. Trivial
- * callers pass NULL.
+ * Forget about all clusters equal to or greater than cpos.
*/
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
- u32 cpos, u32 clusters,
- struct rb_node ***ret_p,
- struct rb_node **ret_parent)
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
{
- struct rb_node **p = &em->em_extents.rb_node;
- struct rb_node *parent = NULL;
- struct ocfs2_extent_map_entry *ent = NULL;
-
- while (*p)
- {
- parent = *p;
- ent = rb_entry(parent, struct ocfs2_extent_map_entry,
- e_node);
- if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
- p = &(*p)->rb_left;
- ent = NULL;
- } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
- le32_to_cpu(ent->e_rec.e_clusters))) {
- p = &(*p)->rb_right;
- ent = NULL;
- } else
- break;
+ struct ocfs2_extent_map_item *emi, *n;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_extent_map *em = &oi->ip_extent_map;
+ LIST_HEAD(tmp_list);
+ unsigned int range;
+
+ spin_lock(&oi->ip_lock);
+ list_for_each_entry_safe(emi, n, &em->em_list, ei_list) {
+ if (emi->ei_cpos >= cpos) {
+ /* Full truncate of this record. */
+ list_move(&emi->ei_list, &tmp_list);
+ BUG_ON(em->em_num_items == 0);
+ em->em_num_items--;
+ continue;
+ }
+
+ range = emi->ei_cpos + emi->ei_clusters;
+ if (range > cpos) {
+ /* Partial truncate */
+ emi->ei_clusters = cpos - emi->ei_cpos;
+ }
}
+ spin_unlock(&oi->ip_lock);
- if (ret_p != NULL)
- *ret_p = p;
- if (ret_parent != NULL)
- *ret_parent = parent;
- return ent;
+ list_for_each_entry_safe(emi, n, &tmp_list, ei_list) {
+ list_del(&emi->ei_list);
+ kfree(emi);
+ }
}
/*
- * Find the leaf containing the interval we want. While we're on our
- * way down the tree, fill in every record we see at any depth, because
- * we might want it later.
- *
- * Note that this code is run without ip_lock. That's because it
- * sleeps while reading. If someone is also filling the extent list at
- * the same time we are, we might have to restart.
+ * Is any part of emi2 contained within emi1
*/
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
- u32 cpos, u32 clusters,
- struct ocfs2_extent_list *el)
+static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
+ struct ocfs2_extent_map_item *emi2)
{
- int i, ret;
- struct buffer_head *eb_bh = NULL;
- u64 blkno;
- u32 rec_end;
- struct ocfs2_extent_block *eb;
- struct ocfs2_extent_rec *rec;
+ unsigned int range1, range2;
/*
- * The bh data containing the el cannot change here, because
- * we hold alloc_sem. So we can do this without other
- * locks.
+ * Check if logical start of emi2 is inside emi1
*/
- while (el->l_tree_depth)
- {
- blkno = 0;
- for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
- rec = &el->l_recs[i];
- rec_end = (le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters));
-
- ret = -EBADR;
- if (rec_end > OCFS2_I(inode)->ip_clusters) {
- mlog_errno(ret);
- ocfs2_error(inode->i_sb,
- "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
- i,
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- OCFS2_I(inode)->ip_clusters);
- goto out_free;
- }
+ range1 = emi1->ei_cpos + emi1->ei_clusters;
+ if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
+ return 1;
- if (rec_end <= cpos) {
- ret = ocfs2_extent_map_insert(inode, rec,
- le16_to_cpu(el->l_tree_depth));
- if (ret && (ret != -EEXIST)) {
- mlog_errno(ret);
- goto out_free;
- }
- continue;
- }
- if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
- ret = ocfs2_extent_map_insert(inode, rec,
- le16_to_cpu(el->l_tree_depth));
- if (ret && (ret != -EEXIST)) {
- mlog_errno(ret);
- goto out_free;
- }
- continue;
- }
+ /*
+ * Check if logical end of emi2 is inside emi1
+ */
+ range2 = emi2->ei_cpos + emi2->ei_clusters;
+ if (range2 > emi1->ei_cpos && range2 <= range1)
+ return 1;
- /*
- * We've found a record that matches our
- * interval. We don't insert it because we're
- * about to traverse it.
- */
-
- /* Check to see if we're stradling */
- ret = -ESRCH;
- if (!ocfs2_extent_rec_contains_clusters(rec,
- cpos,
- clusters)) {
- mlog_errno(ret);
- goto out_free;
- }
+ return 0;
+}
- /*
- * If we've already found a record, the el has
- * two records covering the same interval.
- * EEEK!
- */
- ret = -EBADR;
- if (blkno) {
- mlog_errno(ret);
- ocfs2_error(inode->i_sb,
- "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
- cpos, clusters,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)blkno, i,
- (unsigned long long)le64_to_cpu(rec->e_blkno));
- goto out_free;
- }
+static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
+ struct ocfs2_extent_map_item *src)
+{
+ dest->ei_cpos = src->ei_cpos;
+ dest->ei_phys = src->ei_phys;
+ dest->ei_clusters = src->ei_clusters;
+ dest->ei_flags = src->ei_flags;
+}
- blkno = le64_to_cpu(rec->e_blkno);
- }
+/*
+ * Try to merge emi with ins. Returns 1 if merge succeeds, zero
+ * otherwise.
+ */
+static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
+ struct ocfs2_extent_map_item *ins)
+{
+ /*
+ * Handle contiguousness
+ */
+ if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
+ ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
+ ins->ei_flags == emi->ei_flags) {
+ emi->ei_clusters += ins->ei_clusters;
+ return 1;
+ } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
+ (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
+ ins->ei_flags == emi->ei_flags) {
+ emi->ei_phys = ins->ei_phys;
+ emi->ei_cpos = ins->ei_cpos;
+ emi->ei_clusters += ins->ei_clusters;
+ return 1;
+ }
- /*
- * We don't support holes, and we're still up
- * in the branches, so we'd better have found someone
- */
- ret = -EBADR;
- if (!blkno) {
- ocfs2_error(inode->i_sb,
- "No record found for (cpos = %u, clusters = %u) on inode %llu\n",
- cpos, clusters,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- mlog_errno(ret);
- goto out_free;
- }
+ /*
+ * Overlapping extents - this shouldn't happen unless we've
+ * split an extent to change it's flags. That is exceedingly
+ * rare, so there's no sense in trying to optimize it yet.
+ */
+ if (ocfs2_ei_is_contained(emi, ins) ||
+ ocfs2_ei_is_contained(ins, emi)) {
+ ocfs2_copy_emi_fields(emi, ins);
+ return 1;
+ }
- if (eb_bh) {
- brelse(eb_bh);
- eb_bh = NULL;
- }
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- blkno, &eb_bh, OCFS2_BH_CACHED,
- inode);
- if (ret) {
- mlog_errno(ret);
- goto out_free;
- }
- eb = (struct ocfs2_extent_block *)eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- ret = -EIO;
- goto out_free;
+ /* No merge was possible. */
+ return 0;
+}
+
+/*
+ * In order to reduce complexity on the caller, this insert function
+ * is intentionally liberal in what it will accept.
+ *
+ * The only rule is that the truncate call *must* be used whenever
+ * records have been deleted. This avoids inserting overlapping
+ * records with different physical mappings.
+ */
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+ struct ocfs2_extent_rec *rec)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_extent_map *em = &oi->ip_extent_map;
+ struct ocfs2_extent_map_item *emi, *new_emi = NULL;
+ struct ocfs2_extent_map_item ins;
+
+ ins.ei_cpos = le32_to_cpu(rec->e_cpos);
+ ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
+ le64_to_cpu(rec->e_blkno));
+ ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
+ ins.ei_flags = rec->e_flags;
+
+search:
+ spin_lock(&oi->ip_lock);
+
+ list_for_each_entry(emi, &em->em_list, ei_list) {
+ if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
+ list_move(&emi->ei_list, &em->em_list);
+ spin_unlock(&oi->ip_lock);
+ goto out;
}
- el = &eb->h_list;
}
- BUG_ON(el->l_tree_depth);
+ /*
+ * No item could be merged.
+ *
+ * Either allocate and add a new item, or overwrite the last recently
+ * inserted.
+ */
- for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
- rec = &el->l_recs[i];
+ if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
+ if (new_emi == NULL) {
+ spin_unlock(&oi->ip_lock);
- if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
- OCFS2_I(inode)->ip_clusters) {
- ret = -EBADR;
- mlog_errno(ret);
- ocfs2_error(inode->i_sb,
- "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
- i,
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- OCFS2_I(inode)->ip_clusters);
- return ret;
- }
+ new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
+ if (new_emi == NULL)
+ goto out;
- ret = ocfs2_extent_map_insert(inode, rec,
- le16_to_cpu(el->l_tree_depth));
- if (ret && (ret != -EEXIST)) {
- mlog_errno(ret);
- goto out_free;
+ goto search;
}
+
+ ocfs2_copy_emi_fields(new_emi, &ins);
+ list_add(&new_emi->ei_list, &em->em_list);
+ em->em_num_items++;
+ new_emi = NULL;
+ } else {
+ BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
+ emi = list_entry(em->em_list.prev,
+ struct ocfs2_extent_map_item, ei_list);
+ list_move(&emi->ei_list, &em->em_list);
+ ocfs2_copy_emi_fields(emi, &ins);
}
- ret = 0;
+ spin_unlock(&oi->ip_lock);
-out_free:
- if (eb_bh)
- brelse(eb_bh);
-
- return ret;
+out:
+ kfree(new_emi);
}
-/*
- * This lookup actually will read from disk. It has one invariant:
- * It will never re-traverse blocks. This means that all inserts should
- * be new regions or more granular regions (both allowed by insert).
- */
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
- u32 cpos,
- u32 clusters,
- struct ocfs2_extent_map_entry **ret_ent)
+static int ocfs2_last_eb_is_empty(struct inode *inode,
+ struct ocfs2_dinode *di)
{
- int ret;
- u64 blkno;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent;
- struct buffer_head *bh = NULL;
+ int ret, next_free;
+ u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
+ struct buffer_head *eb_bh = NULL;
struct ocfs2_extent_block *eb;
- struct ocfs2_dinode *di;
struct ocfs2_extent_list *el;
- spin_lock(&OCFS2_I(inode)->ip_lock);
- ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
- if (ent) {
- if (!ent->e_tree_depth) {
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- *ret_ent = ent;
- return 0;
- }
- blkno = le64_to_cpu(ent->e_rec.e_blkno);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
- OCFS2_BH_CACHED, inode);
- if (ret) {
- mlog_errno(ret);
- if (bh)
- brelse(bh);
- return ret;
- }
- eb = (struct ocfs2_extent_block *)bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- brelse(bh);
- return -EIO;
- }
- el = &eb->h_list;
- } else {
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno, &bh,
- OCFS2_BH_CACHED, inode);
- if (ret) {
- mlog_errno(ret);
- if (bh)
- brelse(bh);
- return ret;
- }
- di = (struct ocfs2_dinode *)bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- brelse(bh);
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
- return -EIO;
- }
- el = &di->id2.i_list;
- }
-
- ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
- brelse(bh);
+ ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh);
if (ret) {
mlog_errno(ret);
- return ret;
+ goto out;
}
- ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
- if (!ent) {
- ret = -ESRCH;
- mlog_errno(ret);
- return ret;
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in "
+ "leaf block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
}
- /* FIXME: Make sure this isn't a corruption */
- BUG_ON(ent->e_tree_depth);
+ next_free = le16_to_cpu(el->l_next_free_rec);
- *ret_ent = ent;
+ if (next_free == 0 ||
+ (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
+ ret = 1;
- return 0;
+out:
+ brelse(eb_bh);
+ return ret;
}
/*
- * Callers must hold ip_lock. This can insert pieces of the tree,
- * thus racing lookup if the lock weren't held.
+ * Return the 1st index within el which contains an extent start
+ * larger than v_cluster.
*/
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
- struct ocfs2_extent_map_entry *ent)
+static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
+ u32 v_cluster)
{
- struct rb_node **p, *parent;
- struct ocfs2_extent_map_entry *old_ent;
+ int i;
+ struct ocfs2_extent_rec *rec;
- old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
- le32_to_cpu(ent->e_rec.e_clusters),
- &p, &parent);
- if (old_ent)
- return -EEXIST;
+ for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
- rb_link_node(&ent->e_node, parent, p);
- rb_insert_color(&ent->e_node, &em->em_extents);
+ if (v_cluster < le32_to_cpu(rec->e_cpos))
+ break;
+ }
- return 0;
+ return i;
}
-
/*
- * Simple rule: on any return code other than -EAGAIN, anything left
- * in the insert_context will be freed.
+ * Figure out the size of a hole which starts at v_cluster within the given
+ * extent list.
+ *
+ * If there is no more allocation past v_cluster, we return the maximum
+ * cluster size minus v_cluster.
*
- * Simple rule #2: A return code of -EEXIST from this function or
- * its calls to ocfs2_extent_map_insert_entry() signifies that another
- * thread beat us to the insert. It is not an actual error, but it
- * tells the caller we have no more work to do.
+ * If we have in-inode extents, then el points to the dinode list and
+ * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
+ * containing el.
*/
-static int ocfs2_extent_map_try_insert(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- int tree_depth,
- struct ocfs2_em_insert_context *ctxt)
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+ struct ocfs2_extent_list *el,
+ struct buffer_head *eb_bh,
+ u32 v_cluster,
+ u32 *num_clusters)
{
- int ret;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *old_ent;
+ int ret, i;
+ struct buffer_head *next_eb_bh = NULL;
+ struct ocfs2_extent_block *eb, *next_eb;
- ctxt->need_left = 0;
- ctxt->need_right = 0;
- ctxt->old_ent = NULL;
+ i = ocfs2_search_for_hole_index(el, v_cluster);
- spin_lock(&OCFS2_I(inode)->ip_lock);
- ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
- if (!ret) {
- ctxt->new_ent = NULL;
- goto out_unlock;
- }
-
- /* Since insert_entry failed, the map MUST have old_ent */
- old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
- le32_to_cpu(rec->e_clusters),
- NULL, NULL);
-
- BUG_ON(!old_ent);
-
- if (old_ent->e_tree_depth < tree_depth) {
- /* Another thread beat us to the lower tree_depth */
- ret = -EEXIST;
- goto out_unlock;
- }
+ if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
+ eb = (struct ocfs2_extent_block *)eb_bh->b_data;
- if (old_ent->e_tree_depth == tree_depth) {
/*
- * Another thread beat us to this tree_depth.
- * Let's make sure we agree with that thread (the
- * extent_rec should be identical).
+ * Check the next leaf for any extents.
*/
- if (!memcmp(rec, &old_ent->e_rec,
- sizeof(struct ocfs2_extent_rec)))
- ret = 0;
- else
- /* FIXME: Should this be ESRCH/EBADR??? */
- ret = -EEXIST;
-
- goto out_unlock;
- }
- /*
- * We do it in this order specifically so that no actual tree
- * changes occur until we have all the pieces we need. We
- * don't want malloc failures to leave an inconsistent tree.
- * Whenever we drop the lock, another process could be
- * inserting. Also note that, if another process just beat us
- * to an insert, we might not need the same pieces we needed
- * the first go round. In the end, the pieces we need will
- * be used, and the pieces we don't will be freed.
- */
- ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
- le32_to_cpu(old_ent->e_rec.e_cpos));
- ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
- le32_to_cpu(old_ent->e_rec.e_clusters)) >
- (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
- ret = -EAGAIN;
- if (ctxt->need_left) {
- if (!ctxt->left_ent)
- goto out_unlock;
- *(ctxt->left_ent) = *old_ent;
- ctxt->left_ent->e_rec.e_clusters =
- cpu_to_le32(le32_to_cpu(rec->e_cpos) -
- le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
- }
- if (ctxt->need_right) {
- if (!ctxt->right_ent)
- goto out_unlock;
- *(ctxt->right_ent) = *old_ent;
- ctxt->right_ent->e_rec.e_cpos =
- cpu_to_le32(le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters));
- ctxt->right_ent->e_rec.e_clusters =
- cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
- le32_to_cpu(old_ent->e_rec.e_clusters)) -
- le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
- }
+ if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
+ goto no_more_extents;
- rb_erase(&old_ent->e_node, &em->em_extents);
- /* Now that he's erased, set him up for deletion */
- ctxt->old_ent = old_ent;
+ ret = ocfs2_read_extent_block(ci,
+ le64_to_cpu(eb->h_next_leaf_blk),
+ &next_eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
- if (ctxt->need_left) {
- ret = ocfs2_extent_map_insert_entry(em,
- ctxt->left_ent);
- if (ret)
- goto out_unlock;
- ctxt->left_ent = NULL;
+ next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
+ el = &next_eb->h_list;
+ i = ocfs2_search_for_hole_index(el, v_cluster);
}
- if (ctxt->need_right) {
- ret = ocfs2_extent_map_insert_entry(em,
- ctxt->right_ent);
- if (ret)
- goto out_unlock;
- ctxt->right_ent = NULL;
+no_more_extents:
+ if (i == le16_to_cpu(el->l_next_free_rec)) {
+ /*
+ * We're at the end of our existing allocation. Just
+ * return the maximum number of clusters we could
+ * possibly allocate.
+ */
+ *num_clusters = UINT_MAX - v_cluster;
+ } else {
+ *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
}
- ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
-
- if (!ret)
- ctxt->new_ent = NULL;
-
-out_unlock:
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
+ ret = 0;
+out:
+ brelse(next_eb_bh);
return ret;
}
-
-static int ocfs2_extent_map_insert(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- int tree_depth)
+static int ocfs2_get_clusters_nocache(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 v_cluster, unsigned int *hole_len,
+ struct ocfs2_extent_rec *ret_rec,
+ unsigned int *is_last)
{
- int ret;
- struct ocfs2_em_insert_context ctxt = {0, };
+ int i, ret, tree_height, len;
+ struct ocfs2_dinode *di;
+ struct ocfs2_extent_block *uninitialized_var(eb);
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+ struct buffer_head *eb_bh = NULL;
- if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
- OCFS2_I(inode)->ip_map.em_clusters) {
- ret = -EBADR;
- mlog_errno(ret);
- return ret;
- }
+ memset(ret_rec, 0, sizeof(*ret_rec));
+ if (is_last)
+ *is_last = 0;
- /* Zero e_clusters means a truncated tail record. It better be EOF */
- if (!rec->e_clusters) {
- if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
- OCFS2_I(inode)->ip_map.em_clusters) {
- ret = -EBADR;
+ di = (struct ocfs2_dinode *) di_bh->b_data;
+ el = &di->id2.i_list;
+ tree_height = le16_to_cpu(el->l_tree_depth);
+
+ if (tree_height > 0) {
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
+ &eb_bh);
+ if (ret) {
mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- return ret;
+ "Inode %lu has non zero tree depth in "
+ "leaf block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
}
+ }
+
+ i = ocfs2_search_extent_list(el, v_cluster);
+ if (i == -1) {
+ /*
+ * Holes can be larger than the maximum size of an
+ * extent, so we return their lengths in a separate
+ * field.
+ */
+ if (hole_len) {
+ ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode),
+ el, eb_bh,
+ v_cluster, &len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
- /* Ignore the truncated tail */
- return 0;
+ *hole_len = len;
+ }
+ goto out_hole;
}
- ret = -ENOMEM;
- ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
- GFP_NOFS);
- if (!ctxt.new_ent) {
- mlog_errno(ret);
- return ret;
+ rec = &el->l_recs[i];
+
+ BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+ if (!rec->e_blkno) {
+ ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+ "record (%u, %u, 0)", inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
+ ret = -EROFS;
+ goto out;
}
- ctxt.new_ent->e_rec = *rec;
- ctxt.new_ent->e_tree_depth = tree_depth;
-
- do {
- ret = -ENOMEM;
- if (ctxt.need_left && !ctxt.left_ent) {
- ctxt.left_ent =
- kmem_cache_alloc(ocfs2_em_ent_cachep,
- GFP_NOFS);
- if (!ctxt.left_ent)
- break;
- }
- if (ctxt.need_right && !ctxt.right_ent) {
- ctxt.right_ent =
- kmem_cache_alloc(ocfs2_em_ent_cachep,
- GFP_NOFS);
- if (!ctxt.right_ent)
- break;
+ *ret_rec = *rec;
+
+ /*
+ * Checking for last extent is potentially expensive - we
+ * might have to look at the next leaf over to see if it's
+ * empty.
+ *
+ * The first two checks are to see whether the caller even
+ * cares for this information, and if the extent is at least
+ * the last in it's list.
+ *
+ * If those hold true, then the extent is last if any of the
+ * additional conditions hold true:
+ * - Extent list is in-inode
+ * - Extent list is right-most
+ * - Extent list is 2nd to rightmost, with empty right-most
+ */
+ if (is_last) {
+ if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
+ if (tree_height == 0)
+ *is_last = 1;
+ else if (eb->h_blkno == di->i_last_eb_blk)
+ *is_last = 1;
+ else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
+ ret = ocfs2_last_eb_is_empty(inode, di);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ if (ret == 1)
+ *is_last = 1;
+ }
}
+ }
+
+out_hole:
+ ret = 0;
+out:
+ brelse(eb_bh);
+ return ret;
+}
- ret = ocfs2_extent_map_try_insert(inode, rec,
- tree_depth, &ctxt);
- } while (ret == -EAGAIN);
+static void ocfs2_relative_extent_offsets(struct super_block *sb,
+ u32 v_cluster,
+ struct ocfs2_extent_rec *rec,
+ u32 *p_cluster, u32 *num_clusters)
- if ((ret < 0) && (ret != -EEXIST))
- mlog_errno(ret);
+{
+ u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
- if (ctxt.left_ent)
- kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
- if (ctxt.right_ent)
- kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
- if (ctxt.old_ent)
- kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
- if (ctxt.new_ent)
- kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
+ *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
+ *p_cluster = *p_cluster + coff;
- return ret;
+ if (num_clusters)
+ *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
}
-/*
- * Append this record to the tail of the extent map. It must be
- * tree_depth 0. The record might be an extension of an existing
- * record, and as such that needs to be handled. eg:
- *
- * Existing record in the extent map:
- *
- * cpos = 10, len = 10
- * |---------|
- *
- * New Record:
- *
- * cpos = 10, len = 20
- * |------------------|
- *
- * The passed record is the new on-disk record. The new_clusters value
- * is how many clusters were added to the file. If the append is a
- * contiguous append, the new_clusters has been added to
- * rec->e_clusters. If the append is an entirely new extent, then
- * rec->e_clusters is == new_clusters.
- */
-int ocfs2_extent_map_append(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- u32 new_clusters)
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+ u32 *p_cluster, u32 *num_clusters,
+ struct ocfs2_extent_list *el,
+ unsigned int *extent_flags)
{
- int ret;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent;
- struct ocfs2_extent_rec *old;
+ int ret = 0, i;
+ struct buffer_head *eb_bh = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_rec *rec;
+ u32 coff;
- BUG_ON(!new_clusters);
- BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
+ if (el->l_tree_depth) {
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
+ &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
- if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
- /*
- * Size changed underneath us on disk. Drop any
- * straddling records and update our idea of
- * i_clusters
- */
- ocfs2_extent_map_drop(inode, em->em_clusters - 1);
- em->em_clusters = OCFS2_I(inode)->ip_clusters;
- }
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
- mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters)) !=
- (em->em_clusters + new_clusters),
- "Inode %llu:\n"
- "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
- "em->em_clusters = %u + new_clusters = %u = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
- le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
- em->em_clusters, new_clusters,
- em->em_clusters + new_clusters);
-
- em->em_clusters += new_clusters;
-
- ret = -ENOENT;
- if (le32_to_cpu(rec->e_clusters) > new_clusters) {
- /* This is a contiguous append */
- ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
- NULL, NULL);
- if (ent) {
- old = &ent->e_rec;
- BUG_ON((le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters)) !=
- (le32_to_cpu(old->e_cpos) +
- le32_to_cpu(old->e_clusters) +
- new_clusters));
- if (ent->e_tree_depth == 0) {
- BUG_ON(le32_to_cpu(old->e_cpos) !=
- le32_to_cpu(rec->e_cpos));
- BUG_ON(le64_to_cpu(old->e_blkno) !=
- le64_to_cpu(rec->e_blkno));
- ret = 0;
- }
- /*
- * Let non-leafs fall through as -ENOENT to
- * force insertion of the new leaf.
- */
- le32_add_cpu(&old->e_clusters, new_clusters);
+ if (el->l_tree_depth) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in "
+ "xattr leaf block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
}
}
- if (ret == -ENOENT)
- ret = ocfs2_extent_map_insert(inode, rec, 0);
- if (ret < 0)
+ i = ocfs2_search_extent_list(el, v_cluster);
+ if (i == -1) {
+ ret = -EROFS;
mlog_errno(ret);
+ goto out;
+ } else {
+ rec = &el->l_recs[i];
+ BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+ if (!rec->e_blkno) {
+ ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+ "record (%u, %u, 0) in xattr", inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
+ ret = -EROFS;
+ goto out;
+ }
+ coff = v_cluster - le32_to_cpu(rec->e_cpos);
+ *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
+ le64_to_cpu(rec->e_blkno));
+ *p_cluster = *p_cluster + coff;
+ if (num_clusters)
+ *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+
+ if (extent_flags)
+ *extent_flags = rec->e_flags;
+ }
+out:
+ if (eb_bh)
+ brelse(eb_bh);
return ret;
}
-#if 0
-/* Code here is included but defined out as it completes the extent
- * map api and may be used in the future. */
-
-/*
- * Look up the record containing this cluster offset. This record is
- * part of the extent map. Do not free it. Any changes you make to
- * it will reflect in the extent map. So, if your last extent
- * is (cpos = 10, clusters = 10) and you truncate the file by 5
- * clusters, you can do:
- *
- * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
- * rec->e_clusters -= 5;
- *
- * The lookup does not read from disk. If the map isn't filled in for
- * an entry, you won't find it.
- *
- * Also note that the returned record is valid until alloc_sem is
- * dropped. After that, truncate and extend can happen. Caveat Emptor.
- */
-int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
- struct ocfs2_extent_rec **rec,
- int *tree_depth)
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+ u32 *p_cluster, u32 *num_clusters,
+ unsigned int *extent_flags)
{
- int ret = -ENOENT;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent;
+ int ret;
+ unsigned int uninitialized_var(hole_len), flags = 0;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_extent_rec rec;
- *rec = NULL;
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = -ERANGE;
+ mlog_errno(ret);
+ goto out;
+ }
- if (cpos >= OCFS2_I(inode)->ip_clusters)
- return -EINVAL;
+ ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+ num_clusters, extent_flags);
+ if (ret == 0)
+ goto out;
- if (cpos >= em->em_clusters) {
- /*
- * Size changed underneath us on disk. Drop any
- * straddling records and update our idea of
- * i_clusters
- */
- ocfs2_extent_map_drop(inode, em->em_clusters - 1);
- em->em_clusters = OCFS2_I(inode)->ip_clusters ;
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
- ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
- NULL, NULL);
+ ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
+ &rec, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
- if (ent) {
- *rec = &ent->e_rec;
- if (tree_depth)
- *tree_depth = ent->e_tree_depth;
- ret = 0;
+ if (rec.e_blkno == 0ULL) {
+ /*
+ * A hole was found. Return some canned values that
+ * callers can key on. If asked for, num_clusters will
+ * be populated with the size of the hole.
+ */
+ *p_cluster = 0;
+ if (num_clusters) {
+ *num_clusters = hole_len;
+ }
+ } else {
+ ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
+ p_cluster, num_clusters);
+ flags = rec.e_flags;
+
+ ocfs2_extent_map_insert_rec(inode, &rec);
}
+ if (extent_flags)
+ *extent_flags = flags;
+
+out:
+ brelse(di_bh);
return ret;
}
-int ocfs2_extent_map_get_clusters(struct inode *inode,
- u32 v_cpos, int count,
- u32 *p_cpos, int *ret_count)
+/*
+ * This expects alloc_sem to be held. The allocation cannot change at
+ * all while the map is in the process of being updated.
+ */
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+ u64 *ret_count, unsigned int *extent_flags)
{
int ret;
- u32 coff, ccount;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent = NULL;
-
- *p_cpos = ccount = 0;
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ u32 cpos, num_clusters, p_cluster;
+ u64 boff = 0;
- if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
- return -EINVAL;
+ cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
- if ((v_cpos + count) > em->em_clusters) {
- /*
- * Size changed underneath us on disk. Drop any
- * straddling records and update our idea of
- * i_clusters
- */
- ocfs2_extent_map_drop(inode, em->em_clusters - 1);
- em->em_clusters = OCFS2_I(inode)->ip_clusters;
+ ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
+ extent_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
+ /*
+ * p_cluster == 0 indicates a hole.
+ */
+ if (p_cluster) {
+ boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+ boff += (v_blkno & (u64)(bpc - 1));
+ }
- ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
- if (ret)
- return ret;
+ *p_blkno = boff;
- if (ent) {
- /* We should never find ourselves straddling an interval */
- if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
- v_cpos,
- count))
- return -ESRCH;
+ if (ret_count) {
+ *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+ *ret_count -= v_blkno & (u64)(bpc - 1);
+ }
- coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
- *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
- le64_to_cpu(ent->e_rec.e_blkno)) +
- coff;
+out:
+ return ret;
+}
- if (ret_count)
- *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
+/*
+ * The ocfs2_fiemap_inline() may be a little bit misleading, since
+ * it not only handles the fiemap for inlined files, but also deals
+ * with the fast symlink, cause they have no difference for extent
+ * mapping per se.
+ */
+static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
+ struct fiemap_extent_info *fieinfo,
+ u64 map_start)
+{
+ int ret;
+ unsigned int id_count;
+ struct ocfs2_dinode *di;
+ u64 phys;
+ u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ if (ocfs2_inode_is_fast_symlink(inode))
+ id_count = ocfs2_fast_symlink_chars(inode->i_sb);
+ else
+ id_count = le16_to_cpu(di->id2.i_data.id_count);
+
+ if (map_start < id_count) {
+ phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
+ if (ocfs2_inode_is_fast_symlink(inode))
+ phys += offsetof(struct ocfs2_dinode, id2.i_symlink);
+ else
+ phys += offsetof(struct ocfs2_dinode,
+ id2.i_data.id_data);
- return 0;
+ ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
+ flags);
+ if (ret < 0)
+ return ret;
}
-
- return -ENOENT;
+ return 0;
}
-#endif /* 0 */
+#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
-int ocfs2_extent_map_get_blocks(struct inode *inode,
- u64 v_blkno, int count,
- u64 *p_blkno, int *ret_count)
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 map_start, u64 map_len)
{
- int ret;
- u64 boff;
- u32 cpos, clusters;
- int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
- struct ocfs2_extent_map_entry *ent = NULL;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_rec *rec;
-
- *p_blkno = 0;
+ int ret, is_last;
+ u32 mapping_end, cpos;
+ unsigned int hole_size;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ u64 len_bytes, phys_bytes, virt_bytes;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_extent_rec rec;
+
+ ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
+ if (ret)
+ return ret;
- cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
- clusters = ocfs2_blocks_to_clusters(inode->i_sb,
- (u64)count + bpc - 1);
- if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
- ret = -EINVAL;
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret) {
mlog_errno(ret);
- return ret;
+ goto out;
}
- if ((cpos + clusters) > em->em_clusters) {
- /*
- * Size changed underneath us on disk. Drop any
- * straddling records and update our idea of
- * i_clusters
- */
- ocfs2_extent_map_drop(inode, em->em_clusters - 1);
- em->em_clusters = OCFS2_I(inode)->ip_clusters;
- }
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
- if (ret) {
- mlog_errno(ret);
- return ret;
+ /*
+ * Handle inline-data and fast symlink separately.
+ */
+ if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
+ ocfs2_inode_is_fast_symlink(inode)) {
+ ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
+ goto out_unlock;
}
- if (ent)
- {
- rec = &ent->e_rec;
+ cpos = map_start >> osb->s_clustersize_bits;
+ mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
+ map_start + map_len);
+ is_last = 0;
+ while (cpos < mapping_end && !is_last) {
+ u32 fe_flags;
- /* We should never find ourselves straddling an interval */
- if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
- ret = -ESRCH;
+ ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
+ &hole_size, &rec, &is_last);
+ if (ret) {
mlog_errno(ret);
- return ret;
+ goto out_unlock;
}
- boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
- le32_to_cpu(rec->e_cpos));
- boff += (v_blkno & (u64)(bpc - 1));
- *p_blkno = le64_to_cpu(rec->e_blkno) + boff;
-
- if (ret_count) {
- *ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
- le32_to_cpu(rec->e_clusters)) - boff;
+ if (rec.e_blkno == 0ULL) {
+ cpos += hole_size;
+ continue;
}
- return 0;
+ fe_flags = 0;
+ if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
+ fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
+ if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
+ fe_flags |= FIEMAP_EXTENT_SHARED;
+ if (is_last)
+ fe_flags |= FIEMAP_EXTENT_LAST;
+ len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
+ phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
+ virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
+
+ ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
+ len_bytes, fe_flags);
+ if (ret)
+ break;
+
+ cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters);
}
- return -ENOENT;
-}
+ if (ret > 0)
+ ret = 0;
-int ocfs2_extent_map_init(struct inode *inode)
-{
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+out_unlock:
+ brelse(di_bh);
- em->em_extents = RB_ROOT;
- em->em_clusters = 0;
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
- return 0;
+ ocfs2_inode_unlock(inode, 0);
+out:
+
+ return ret;
}
-/* Needs the lock */
-static void __ocfs2_extent_map_drop(struct inode *inode,
- u32 new_clusters,
- struct rb_node **free_head,
- struct ocfs2_extent_map_entry **tail_ent)
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
{
- struct rb_node *node, *next;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent;
-
- *free_head = NULL;
+ struct inode *inode = file->f_mapping->host;
+ int ret;
+ unsigned int is_last = 0, is_data = 0;
+ u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 cpos, cend, clen, hole_size;
+ u64 extoff, extlen;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_extent_rec rec;
- ent = NULL;
- node = rb_last(&em->em_extents);
- while (node)
- {
- next = rb_prev(node);
+ BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE);
- ent = rb_entry(node, struct ocfs2_extent_map_entry,
- e_node);
- if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
- break;
-
- rb_erase(&ent->e_node, &em->em_extents);
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
- node->rb_right = *free_head;
- *free_head = node;
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
- ent = NULL;
- node = next;
+ if (*offset >= i_size_read(inode)) {
+ ret = -ENXIO;
+ goto out_unlock;
}
- /* Do we have an entry straddling new_clusters? */
- if (tail_ent) {
- if (ent &&
- ((le32_to_cpu(ent->e_rec.e_cpos) +
- le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
- *tail_ent = ent;
- else
- *tail_ent = NULL;
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ if (whence == SEEK_HOLE)
+ *offset = i_size_read(inode);
+ goto out_unlock;
}
-}
-static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
-{
- struct rb_node *node;
- struct ocfs2_extent_map_entry *ent;
+ clen = 0;
+ cpos = *offset >> cs_bits;
+ cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
- while (free_head) {
- node = free_head;
- free_head = node->rb_right;
+ while (cpos < cend && !is_last) {
+ ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
+ &rec, &is_last);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
- ent = rb_entry(node, struct ocfs2_extent_map_entry,
- e_node);
- kmem_cache_free(ocfs2_em_ent_cachep, ent);
- }
-}
+ extoff = cpos;
+ extoff <<= cs_bits;
-/*
- * Remove all entries past new_clusters, inclusive of an entry that
- * contains new_clusters. This is effectively a cache forget.
- *
- * If you want to also clip the last extent by some number of clusters,
- * you need to call ocfs2_extent_map_trunc().
- * This code does not check or modify ip_clusters.
- */
-int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
-{
- struct rb_node *free_head = NULL;
- struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
- struct ocfs2_extent_map_entry *ent;
+ if (rec.e_blkno == 0ULL) {
+ clen = hole_size;
+ is_data = 0;
+ } else {
+ clen = le16_to_cpu(rec.e_leaf_clusters) -
+ (cpos - le32_to_cpu(rec.e_cpos));
+ is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
+ }
- spin_lock(&OCFS2_I(inode)->ip_lock);
+ if ((!is_data && whence == SEEK_HOLE) ||
+ (is_data && whence == SEEK_DATA)) {
+ if (extoff > *offset)
+ *offset = extoff;
+ goto out_unlock;
+ }
- __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
+ if (!is_last)
+ cpos += clen;
+ }
- if (ent) {
- rb_erase(&ent->e_node, &em->em_extents);
- ent->e_node.rb_right = free_head;
- free_head = &ent->e_node;
+ if (whence == SEEK_HOLE) {
+ extoff = cpos;
+ extoff <<= cs_bits;
+ extlen = clen;
+ extlen <<= cs_bits;
+
+ if ((extoff + extlen) > i_size_read(inode))
+ extlen = i_size_read(inode) - extoff;
+ extoff += extlen;
+ if (extoff > *offset)
+ *offset = extoff;
+ goto out_unlock;
}
- spin_unlock(&OCFS2_I(inode)->ip_lock);
+ ret = -ENXIO;
- if (free_head)
- __ocfs2_extent_map_drop_cleanup(free_head);
+out_unlock:
- return 0;
+ brelse(di_bh);
+
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ocfs2_inode_unlock(inode, 0);
+out:
+ return ret;
}
-/*
- * Remove all entries past new_clusters and also clip any extent
- * straddling new_clusters, if there is one. This does not check
- * or modify ip_clusters
- */
-int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+ struct buffer_head *bhs[], int flags,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh))
{
- struct rb_node *free_head = NULL;
- struct ocfs2_extent_map_entry *ent = NULL;
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
+ int rc = 0;
+ u64 p_block, p_count;
+ int i, count, done = 0;
+
+ trace_ocfs2_read_virt_blocks(
+ inode, (unsigned long long)v_block, nr, bhs, flags,
+ validate);
+
+ if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
+ i_size_read(inode)) {
+ BUG_ON(!(flags & OCFS2_BH_READAHEAD));
+ goto out;
+ }
- __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
+ while (done < nr) {
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
+ &p_block, &p_count, NULL);
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ if (rc) {
+ mlog_errno(rc);
+ break;
+ }
- if (ent)
- ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
- le32_to_cpu(ent->e_rec.e_cpos));
+ if (!p_block) {
+ rc = -EIO;
+ mlog(ML_ERROR,
+ "Inode #%llu contains a hole at offset %llu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)(v_block + done) <<
+ inode->i_sb->s_blocksize_bits);
+ break;
+ }
- OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
+ count = nr - done;
+ if (p_count < count)
+ count = p_count;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
+ /*
+ * If the caller passed us bhs, they should have come
+ * from a previous readahead call to this function. Thus,
+ * they should have the right b_blocknr.
+ */
+ for (i = 0; i < count; i++) {
+ if (!bhs[done + i])
+ continue;
+ BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
+ }
- if (free_head)
- __ocfs2_extent_map_drop_cleanup(free_head);
+ rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count,
+ bhs + done, flags, validate);
+ if (rc) {
+ mlog_errno(rc);
+ break;
+ }
+ done += count;
+ }
- return 0;
+out:
+ return rc;
}
-int __init init_ocfs2_extent_maps(void)
-{
- ocfs2_em_ent_cachep =
- kmem_cache_create("ocfs2_em_ent",
- sizeof(struct ocfs2_extent_map_entry),
- 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
- if (!ocfs2_em_ent_cachep)
- return -ENOMEM;
- return 0;
-}
-
-void exit_ocfs2_extent_maps(void)
-{
- kmem_cache_destroy(ocfs2_em_ent_cachep);
-}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index fa3745efa88..67ea57d2fd5 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -25,22 +25,68 @@
#ifndef _EXTENT_MAP_H
#define _EXTENT_MAP_H
-int init_ocfs2_extent_maps(void);
-void exit_ocfs2_extent_maps(void);
+struct ocfs2_extent_map_item {
+ unsigned int ei_cpos;
+ unsigned int ei_phys;
+ unsigned int ei_clusters;
+ unsigned int ei_flags;
+
+ struct list_head ei_list;
+};
+
+#define OCFS2_MAX_EXTENT_MAP_ITEMS 3
+struct ocfs2_extent_map {
+ unsigned int em_num_items;
+ struct list_head em_list;
+};
+
+void ocfs2_extent_map_init(struct inode *inode);
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+ struct ocfs2_extent_rec *rec);
+
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
+ u32 *num_clusters, unsigned int *extent_flags);
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+ u64 *ret_count, unsigned int *extent_flags);
+
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 map_start, u64 map_len);
+
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
+
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+ u32 *p_cluster, u32 *num_clusters,
+ struct ocfs2_extent_list *el,
+ unsigned int *extent_flags);
+
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+ struct buffer_head *bhs[], int flags,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh));
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+ struct ocfs2_extent_list *el,
+ struct buffer_head *eb_bh,
+ u32 v_cluster,
+ u32 *num_clusters);
+static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
+ struct buffer_head **bh,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh))
+{
+ int status = 0;
+
+ if (bh == NULL) {
+ printk("ocfs2: bh == NULL\n");
+ status = -EINVAL;
+ goto bail;
+ }
+
+ status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate);
+
+bail:
+ return status;
+}
-/*
- * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
- * to be held. The allocation cannot change at all while the map is
- * in the process of being updated.
- */
-int ocfs2_extent_map_init(struct inode *inode);
-int ocfs2_extent_map_append(struct inode *inode,
- struct ocfs2_extent_rec *rec,
- u32 new_clusters);
-int ocfs2_extent_map_get_blocks(struct inode *inode,
- u64 v_blkno, int count,
- u64 *p_blkno, int *ret_count);
-int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
-int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2bbfa17090c..2930e231f3f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -30,8 +30,14 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/splice.h>
+#include <linux/mount.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+#include <linux/quotaops.h>
+#include <linux/blkdev.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -46,16 +52,45 @@
#include "inode.h"
#include "ioctl.h"
#include "journal.h"
+#include "locks.h"
#include "mmap.h"
#include "suballoc.h"
#include "super.h"
+#include "xattr.h"
+#include "acl.h"
+#include "quota.h"
+#include "refcounttree.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
-static int ocfs2_sync_inode(struct inode *inode)
+static int ocfs2_init_file_private(struct inode *inode, struct file *file)
{
- filemap_fdatawrite(inode->i_mapping);
- return sync_mapping_buffers(inode->i_mapping);
+ struct ocfs2_file_private *fp;
+
+ fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
+ if (!fp)
+ return -ENOMEM;
+
+ fp->fp_file = file;
+ mutex_init(&fp->fp_mutex);
+ ocfs2_file_lock_res_init(&fp->fp_flock, fp);
+ file->private_data = fp;
+
+ return 0;
+}
+
+static void ocfs2_free_file_private(struct inode *inode, struct file *file)
+{
+ struct ocfs2_file_private *fp = file->private_data;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (fp) {
+ ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
+ ocfs2_lock_res_free(&fp->fp_flock);
+ kfree(fp);
+ file->private_data = NULL;
+ }
}
static int ocfs2_file_open(struct inode *inode, struct file *file)
@@ -64,8 +99,13 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
int mode = file->f_flags;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
- file->f_dentry->d_name.len, file->f_dentry->d_name.name);
+ trace_ocfs2_file_open(inode, file, file->f_path.dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name, mode);
+
+ if (file->f_mode & FMODE_WRITE)
+ dquot_initialize(inode);
spin_lock(&oi->ip_lock);
@@ -84,9 +124,19 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
oi->ip_open_count++;
spin_unlock(&oi->ip_lock);
- status = 0;
+
+ status = ocfs2_init_file_private(inode, file);
+ if (status) {
+ /*
+ * We want to set open count back if we're failing the
+ * open.
+ */
+ spin_lock(&oi->ip_lock);
+ oi->ip_open_count--;
+ spin_unlock(&oi->ip_lock);
+ }
+
leave:
- mlog_exit(status);
return status;
}
@@ -94,55 +144,166 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
- file->f_dentry->d_name.len,
- file->f_dentry->d_name.name);
-
spin_lock(&oi->ip_lock);
if (!--oi->ip_open_count)
oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
+
+ trace_ocfs2_file_release(inode, file, file->f_path.dentry,
+ oi->ip_blkno,
+ file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name,
+ oi->ip_open_count);
spin_unlock(&oi->ip_lock);
- mlog_exit(0);
+ ocfs2_free_file_private(inode, file);
return 0;
}
-static int ocfs2_sync_file(struct file *file,
- struct dentry *dentry,
+static int ocfs2_dir_open(struct inode *inode, struct file *file)
+{
+ return ocfs2_init_file_private(inode, file);
+}
+
+static int ocfs2_dir_release(struct inode *inode, struct file *file)
+{
+ ocfs2_free_file_private(inode, file);
+ return 0;
+}
+
+static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
int datasync)
{
int err = 0;
- journal_t *journal;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ journal_t *journal = osb->journal->j_journal;
+ int ret;
+ tid_t commit_tid;
+ bool needs_barrier = false;
- mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
+ OCFS2_I(inode)->ip_blkno,
+ file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name,
+ (unsigned long long)datasync);
- err = ocfs2_sync_inode(dentry->d_inode);
- if (err)
- goto bail;
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
- journal = osb->journal->j_journal;
- err = journal_force_commit(journal);
+ err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (err)
+ return err;
+
+ commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
+ if (journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ needs_barrier = true;
+ err = jbd2_complete_transaction(journal, commit_tid);
+ if (needs_barrier) {
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (!err)
+ err = ret;
+ }
-bail:
- mlog_exit(err);
+ if (err)
+ mlog_errno(err);
return (err < 0) ? -EIO : 0;
}
-int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
- u64 new_i_size)
+int ocfs2_should_update_atime(struct inode *inode,
+ struct vfsmount *vfsmnt)
+{
+ struct timespec now;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return 0;
+
+ if ((inode->i_flags & S_NOATIME) ||
+ ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
+ return 0;
+
+ /*
+ * We can be called with no vfsmnt structure - NFSD will
+ * sometimes do this.
+ *
+ * Note that our action here is different than touch_atime() -
+ * if we can't tell whether this is a noatime mount, then we
+ * don't know whether to trust the value of s_atime_quantum.
+ */
+ if (vfsmnt == NULL)
+ return 0;
+
+ if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
+ ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
+ return 0;
+
+ if (vfsmnt->mnt_flags & MNT_RELATIME) {
+ if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
+ (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
+ return 1;
+
+ return 0;
+ }
+
+ now = CURRENT_TIME;
+ if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
+ return 0;
+ else
+ return 1;
+}
+
+int ocfs2_update_inode_atime(struct inode *inode,
+ struct buffer_head *bh)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ handle_t *handle;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /*
+ * Don't use ocfs2_mark_inode_dirty() here as we don't always
+ * have i_mutex to guard against concurrent changes to other
+ * inode fields.
+ */
+ inode->i_atime = CURRENT_TIME;
+ di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
+ di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
+ ocfs2_journal_dirty(handle, bh);
+
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+ return ret;
+}
+
+static int ocfs2_set_inode_size(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ u64 new_i_size)
{
int status;
- mlog_entry_void();
i_size_write(inode, new_i_size);
- inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
@@ -152,22 +313,20 @@ int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
}
bail:
- mlog_exit(status);
return status;
}
-static int ocfs2_simple_size_update(struct inode *inode,
- struct buffer_head *di_bh,
- u64 new_i_size)
+int ocfs2_simple_size_update(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 new_i_size)
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_journal_handle *handle = NULL;
+ handle_t *handle = NULL;
- handle = ocfs2_start_trans(osb, NULL,
- OCFS2_INODE_UPDATE_CREDITS);
- if (handle == NULL) {
- ret = -ENOMEM;
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
mlog_errno(ret);
goto out;
}
@@ -177,38 +336,108 @@ static int ocfs2_simple_size_update(struct inode *inode,
if (ret < 0)
mlog_errno(ret);
- ocfs2_commit_trans(handle);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
+ ocfs2_commit_trans(osb, handle);
out:
return ret;
}
+static int ocfs2_cow_file_pos(struct inode *inode,
+ struct buffer_head *fe_bh,
+ u64 offset)
+{
+ int status;
+ u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+
+ /*
+ * If the new offset is aligned to the range of the cluster, there is
+ * no space for ocfs2_zero_range_for_truncate to fill, so no need to
+ * CoW either.
+ */
+ if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
+ return 0;
+
+ status = ocfs2_get_clusters(inode, cpos, &phys,
+ &num_clusters, &ext_flags);
+ if (status) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+ goto out;
+
+ return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
+
+out:
+ return status;
+}
+
static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
u64 new_i_size)
{
int status;
- struct ocfs2_journal_handle *handle;
+ handle_t *handle;
+ struct ocfs2_dinode *di;
+ u64 cluster_bytes;
- mlog_entry_void();
+ /*
+ * We need to CoW the cluster contains the offset if it is reflinked
+ * since we will call ocfs2_zero_range_for_truncate later which will
+ * write "0" from offset to the end of the cluster.
+ */
+ status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
+ if (status) {
+ mlog_errno(status);
+ return status;
+ }
/* TODO: This needs to actually orphan the inode in this
* transaction. */
- handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out;
}
- status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
- if (status < 0)
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ /*
+ * Do this before setting i_size.
+ */
+ cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+ status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
+ cluster_bytes);
+ if (status) {
mlog_errno(status);
+ goto out_commit;
+ }
+
+ i_size_write(inode, new_i_size);
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+ di = (struct ocfs2_dinode *) fe_bh->b_data;
+ di->i_size = cpu_to_le64(new_i_size);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
+
+ ocfs2_journal_dirty(handle, fe_bh);
- ocfs2_commit_trans(handle);
+out_commit:
+ ocfs2_commit_trans(osb, handle);
out:
- mlog_exit(status);
return status;
}
@@ -219,20 +448,14 @@ static int ocfs2_truncate_file(struct inode *inode,
int status = 0;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_truncate_context *tc = NULL;
-
- mlog_entry("(inode = %llu, new_i_size = %llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)new_i_size);
-
- truncate_inode_pages(inode->i_mapping, new_i_size);
+ /* We trust di_bh because it comes from ocfs2_inode_lock(), which
+ * already validated it */
fe = (struct ocfs2_dinode *) di_bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- status = -EIO;
- goto bail;
- }
+
+ trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)le64_to_cpu(fe->i_size),
+ (unsigned long long)new_i_size);
mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
"Inode %llu, inode i_size = %lld != di "
@@ -243,45 +466,36 @@ static int ocfs2_truncate_file(struct inode *inode,
le32_to_cpu(fe->i_flags));
if (new_i_size > le64_to_cpu(fe->i_size)) {
- mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
- (unsigned long long)le64_to_cpu(fe->i_size),
- (unsigned long long)new_i_size);
+ trace_ocfs2_truncate_file_error(
+ (unsigned long long)le64_to_cpu(fe->i_size),
+ (unsigned long long)new_i_size);
status = -EINVAL;
mlog_errno(status);
goto bail;
}
- mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- (unsigned long long)le64_to_cpu(fe->i_size),
- (unsigned long long)new_i_size);
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
- /* lets handle the simple truncate cases before doing any more
- * cluster locking. */
- if (new_i_size == le64_to_cpu(fe->i_size))
- goto bail;
+ ocfs2_resv_discard(&osb->osb_la_resmap,
+ &OCFS2_I(inode)->ip_la_data_resv);
- /* This forces other nodes to sync and drop their pages. Do
- * this even if we have a truncate without allocation change -
- * ocfs2 cluster sizes can be much greater than page size, so
- * we have to truncate them anyway. */
- status = ocfs2_data_lock(inode, 1);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- ocfs2_data_unlock(inode, 1);
+ /*
+ * The inode lock forced other nodes to sync and drop their
+ * pages, which (correctly) happens even if we have a truncate
+ * without allocation change - ocfs2 cluster sizes can be much
+ * greater than page size, so we have to truncate them
+ * anyway.
+ */
+ unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
- if (le32_to_cpu(fe->i_clusters) ==
- ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
- mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
- fe->i_clusters);
- /* No allocation change is required, so lets fast path
- * this truncate. */
- status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
- if (status < 0)
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
+ i_size_read(inode), 1);
+ if (status)
mlog_errno(status);
- goto bail;
+
+ goto bail_unlock_sem;
}
/* alright, we're going to need to do a full blown alloc size
@@ -291,30 +505,28 @@ static int ocfs2_truncate_file(struct inode *inode,
status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
if (status < 0) {
mlog_errno(status);
- goto bail;
- }
-
- status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ goto bail_unlock_sem;
}
- status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
+ status = ocfs2_commit_truncate(osb, inode, di_bh);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto bail_unlock_sem;
}
/* TODO: orphan dir cleanup here. */
+bail_unlock_sem:
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
bail:
+ if (!status && OCFS2_I(inode)->ip_clusters == 0)
+ status = ocfs2_try_remove_refcount_tree(inode, di_bh);
- mlog_exit(status);
return status;
}
/*
- * extend allocation only here.
+ * extend file allocation only here.
* we'll update all the disk stuff, and oip->alloc_size
*
* expect stuff to be locked, a transaction started and enough data /
@@ -323,192 +535,71 @@ bail:
* Will return -EAGAIN, and a reason if a restart is needed.
* If passed in, *reason will always be set, even in error.
*/
-int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
- struct inode *inode,
- u32 clusters_to_add,
- struct buffer_head *fe_bh,
- struct ocfs2_journal_handle *handle,
- struct ocfs2_alloc_context *data_ac,
- struct ocfs2_alloc_context *meta_ac,
- enum ocfs2_alloc_restarted *reason_ret)
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+ struct inode *inode,
+ u32 *logical_offset,
+ u32 clusters_to_add,
+ int mark_unwritten,
+ struct buffer_head *fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason_ret)
{
- int status = 0;
- int free_extents;
- struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
- enum ocfs2_alloc_restarted reason = RESTART_NONE;
- u32 bit_off, num_bits;
- u64 block;
-
- BUG_ON(!clusters_to_add);
-
- free_extents = ocfs2_num_free_extents(osb, inode, fe);
- if (free_extents < 0) {
- status = free_extents;
- mlog_errno(status);
- goto leave;
- }
-
- /* there are two cases which could cause us to EAGAIN in the
- * we-need-more-metadata case:
- * 1) we haven't reserved *any*
- * 2) we are so fragmented, we've needed to add metadata too
- * many times. */
- if (!free_extents && !meta_ac) {
- mlog(0, "we haven't reserved any metadata!\n");
- status = -EAGAIN;
- reason = RESTART_META;
- goto leave;
- } else if ((!free_extents)
- && (ocfs2_alloc_context_bits_left(meta_ac)
- < ocfs2_extend_meta_needed(fe))) {
- mlog(0, "filesystem is really fragmented...\n");
- status = -EAGAIN;
- reason = RESTART_META;
- goto leave;
- }
-
- status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
- &bit_off, &num_bits);
- if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
- goto leave;
- }
-
- BUG_ON(num_bits > clusters_to_add);
-
- /* reserve our write early -- insert_extent may update the inode */
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
- mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
- num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
- num_bits, meta_ac);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- le32_add_cpu(&fe->i_clusters, num_bits);
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- clusters_to_add -= num_bits;
+ int ret;
+ struct ocfs2_extent_tree et;
- if (clusters_to_add) {
- mlog(0, "need to alloc once more, clusters = %u, wanted = "
- "%u\n", fe->i_clusters, clusters_to_add);
- status = -EAGAIN;
- reason = RESTART_TRANS;
- }
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
+ ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
+ clusters_to_add, mark_unwritten,
+ data_ac, meta_ac, reason_ret);
-leave:
- mlog_exit(status);
- if (reason_ret)
- *reason_ret = reason;
- return status;
+ return ret;
}
-static int ocfs2_extend_allocation(struct inode *inode,
- u32 clusters_to_add)
+static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+ u32 clusters_to_add, int mark_unwritten)
{
int status = 0;
int restart_func = 0;
- int drop_alloc_sem = 0;
- int credits, num_free_extents;
+ int credits;
u32 prev_clusters;
struct buffer_head *bh = NULL;
struct ocfs2_dinode *fe = NULL;
- struct ocfs2_journal_handle *handle = NULL;
+ handle_t *handle = NULL;
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
enum ocfs2_alloc_restarted why;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_extent_tree et;
+ int did_quota = 0;
- mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
+ /*
+ * Unwritten extent only exists for file systems which
+ * support holes.
+ */
+ BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
- OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_inode_block(inode, &bh);
if (status < 0) {
mlog_errno(status);
goto leave;
}
-
fe = (struct ocfs2_dinode *) bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- status = -EIO;
- goto leave;
- }
restart_all:
BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
- mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
- "clusters_to_add = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
- fe->i_clusters, clusters_to_add);
-
- handle = ocfs2_alloc_handle(osb);
- if (handle == NULL) {
- status = -ENOMEM;
- mlog_errno(status);
- goto leave;
- }
-
- num_free_extents = ocfs2_num_free_extents(osb,
- inode,
- fe);
- if (num_free_extents < 0) {
- status = num_free_extents;
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
+ status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+ &data_ac, &meta_ac);
+ if (status) {
mlog_errno(status);
goto leave;
}
- if (!num_free_extents) {
- status = ocfs2_reserve_new_metadata(osb,
- handle,
- fe,
- &meta_ac);
- if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
- goto leave;
- }
- }
-
- status = ocfs2_reserve_clusters(osb,
- handle,
- clusters_to_add,
- &data_ac);
- if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
- goto leave;
- }
-
- /* blocks peope in read/write from reading our allocation
- * until we're done changing it. We depend on i_mutex to block
- * other extend/truncate calls while we're here. Ordering wrt
- * start_trans is important here -- always do it before! */
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
- drop_alloc_sem = 1;
-
- credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
- handle = ocfs2_start_trans(osb, handle, credits);
+ credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
+ handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -517,11 +608,23 @@ restart_all:
}
restarted_transaction:
+ trace_ocfs2_extend_allocation(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)i_size_read(inode),
+ le32_to_cpu(fe->i_clusters), clusters_to_add,
+ why, restart_func);
+
+ status = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+ if (status)
+ goto leave;
+ did_quota = 1;
+
/* reserve a write to the file entry early on - that we if we
* run out of credits in the allocation path, we can still
* update i_size. */
- status = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -529,43 +632,40 @@ restarted_transaction:
prev_clusters = OCFS2_I(inode)->ip_clusters;
- status = ocfs2_do_extend_allocation(osb,
- inode,
- clusters_to_add,
- bh,
- handle,
- data_ac,
- meta_ac,
- &why);
+ status = ocfs2_add_inode_data(osb,
+ inode,
+ &logical_start,
+ clusters_to_add,
+ mark_unwritten,
+ bh,
+ handle,
+ data_ac,
+ meta_ac,
+ &why);
if ((status < 0) && (status != -EAGAIN)) {
if (status != -ENOSPC)
mlog_errno(status);
goto leave;
}
-
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ ocfs2_journal_dirty(handle, bh);
spin_lock(&OCFS2_I(inode)->ip_lock);
clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
spin_unlock(&OCFS2_I(inode)->ip_lock);
+ /* Release unused quota reservation */
+ dquot_free_space(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+ did_quota = 0;
if (why != RESTART_NONE && clusters_to_add) {
if (why == RESTART_META) {
- mlog(0, "restarting function.\n");
restart_func = 1;
+ status = 0;
} else {
BUG_ON(why != RESTART_TRANS);
- mlog(0, "restarting transaction.\n");
- /* TODO: This can be more intelligent. */
- credits = ocfs2_calc_extend_credits(osb->sb,
- fe,
- clusters_to_add);
- status = ocfs2_extend_trans(handle, credits);
+ status = ocfs2_allocate_extend_trans(handle, 1);
if (status < 0) {
/* handle still has to be committed at
* this point. */
@@ -577,18 +677,18 @@ restarted_transaction:
}
}
- mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
- fe->i_clusters, (unsigned long long)fe->i_size);
- mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
- OCFS2_I(inode)->ip_clusters, i_size_read(inode));
+ trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
+ le32_to_cpu(fe->i_clusters),
+ (unsigned long long)le64_to_cpu(fe->i_size),
+ OCFS2_I(inode)->ip_clusters,
+ (unsigned long long)i_size_read(inode));
leave:
- if (drop_alloc_sem) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- drop_alloc_sem = 0;
- }
+ if (status < 0 && did_quota)
+ dquot_free_space(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
if (handle) {
- ocfs2_commit_trans(handle);
+ ocfs2_commit_trans(osb, handle);
handle = NULL;
}
if (data_ac) {
@@ -603,71 +703,145 @@ leave:
restart_func = 0;
goto restart_all;
}
- if (bh) {
- brelse(bh);
- bh = NULL;
- }
+ brelse(bh);
+ bh = NULL;
- mlog_exit(status);
return status;
}
+/*
+ * While a write will already be ordering the data, a truncate will not.
+ * Thus, we need to explicitly order the zeroed pages.
+ */
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ handle_t *handle = NULL;
+ int ret = 0;
+
+ if (!ocfs2_should_order_data(inode))
+ goto out;
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_jbd2_file_inode(handle, inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret)
+ mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+
+out:
+ if (ret) {
+ if (!IS_ERR(handle))
+ ocfs2_commit_trans(osb, handle);
+ handle = ERR_PTR(ret);
+ }
+ return handle;
+}
+
/* Some parts of this taken from generic_cont_expand, which turned out
* to be too fragile to do exactly what we need without us having to
- * worry about recursive locking in ->prepare_write() and
- * ->commit_write(). */
-static int ocfs2_write_zero_page(struct inode *inode,
- u64 size)
+ * worry about recursive locking in ->write_begin() and ->write_end(). */
+static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
+ u64 abs_to, struct buffer_head *di_bh)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long index;
- unsigned int offset;
- struct ocfs2_journal_handle *handle = NULL;
- int ret;
+ unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
+ handle_t *handle = NULL;
+ int ret = 0;
+ unsigned zero_from, zero_to, block_start, block_end;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
- offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
- /* ugh. in prepare/commit_write, if from==to==start of block, we
- ** skip the prepare. make sure we never send an offset for the start
- ** of a block
- */
- if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
- offset++;
- }
- index = size >> PAGE_CACHE_SHIFT;
+ BUG_ON(abs_from >= abs_to);
+ BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
+ BUG_ON(abs_from & (inode->i_blkbits - 1));
- page = grab_cache_page(mapping, index);
+ page = find_or_create_page(mapping, index, GFP_NOFS);
if (!page) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
- ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_unlock;
- }
+ /* Get the offsets within the page that we want to zero */
+ zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
+ zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+ if (!zero_to)
+ zero_to = PAGE_CACHE_SIZE;
- if (ocfs2_should_order_data(inode)) {
- handle = ocfs2_start_walk_page_trans(inode, page, offset,
- offset);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
+ trace_ocfs2_write_zero_page(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)abs_from,
+ (unsigned long long)abs_to,
+ index, zero_from, zero_to);
+
+ /* We know that zero_from is block aligned */
+ for (block_start = zero_from; block_start < zero_to;
+ block_start = block_end) {
+ block_end = block_start + (1 << inode->i_blkbits);
+
+ /*
+ * block_start is block-aligned. Bump it by one to force
+ * __block_write_begin and block_commit_write to zero the
+ * whole block.
+ */
+ ret = __block_write_begin(page, block_start + 1, 0,
+ ocfs2_get_block);
+ if (ret < 0) {
+ mlog_errno(ret);
goto out_unlock;
}
+
+ if (!handle) {
+ handle = ocfs2_zero_start_ordered_transaction(inode,
+ di_bh);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ break;
+ }
+ }
+
+ /* must not update i_size! */
+ ret = block_commit_write(page, block_start + 1,
+ block_start + 1);
+ if (ret < 0)
+ mlog_errno(ret);
+ else
+ ret = 0;
}
- /* must not update i_size! */
- ret = block_commit_write(page, offset, offset);
- if (ret < 0)
- mlog_errno(ret);
- else
- ret = 0;
+ if (handle) {
+ /*
+ * fs-writeback will release the dirty pages without page lock
+ * whose offset are over inode size, the release happens at
+ * block_write_full_page().
+ */
+ i_size_write(inode, abs_to);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ di->i_mtime_nsec = di->i_ctime_nsec;
+ ocfs2_journal_dirty(handle, di_bh);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+ }
- if (handle)
- ocfs2_commit_trans(handle);
out_unlock:
unlock_page(page);
page_cache_release(page);
@@ -675,97 +849,269 @@ out:
return ret;
}
-static int ocfs2_zero_extend(struct inode *inode,
- u64 zero_to_size)
+/*
+ * Find the next range to zero. We do this in terms of bytes because
+ * that's what ocfs2_zero_extend() wants, and it is dealing with the
+ * pagecache. We may return multiple extents.
+ *
+ * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
+ * needs to be zeroed. range_start and range_end return the next zeroing
+ * range. A subsequent call should pass the previous range_end as its
+ * zero_start. If range_end is 0, there's nothing to do.
+ *
+ * Unwritten extents are skipped over. Refcounted extents are CoWd.
+ */
+static int ocfs2_zero_extend_get_range(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 zero_start, u64 zero_end,
+ u64 *range_start, u64 *range_end)
+{
+ int rc = 0, needs_cow = 0;
+ u32 p_cpos, zero_clusters = 0;
+ u32 zero_cpos =
+ zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+
+ while (zero_cpos < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
+ &num_clusters, &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+ zero_clusters = num_clusters;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ break;
+ }
+
+ zero_cpos += num_clusters;
+ }
+ if (!zero_clusters) {
+ *range_end = 0;
+ goto out;
+ }
+
+ while ((zero_cpos + zero_clusters) < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
+ &p_cpos, &num_clusters,
+ &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
+ break;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ zero_clusters += num_clusters;
+ }
+ if ((zero_cpos + zero_clusters) > last_cpos)
+ zero_clusters = last_cpos - zero_cpos;
+
+ if (needs_cow) {
+ rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
+ zero_clusters, UINT_MAX);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+ }
+
+ *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
+ *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
+ zero_cpos + zero_clusters);
+
+out:
+ return rc;
+}
+
+/*
+ * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
+ * has made sure that the entire range needs zeroing.
+ */
+static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
+ u64 range_end, struct buffer_head *di_bh)
+{
+ int rc = 0;
+ u64 next_pos;
+ u64 zero_pos = range_start;
+
+ trace_ocfs2_zero_extend_range(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)range_start,
+ (unsigned long long)range_end);
+ BUG_ON(range_start >= range_end);
+
+ while (zero_pos < range_end) {
+ next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+ if (next_pos > range_end)
+ next_pos = range_end;
+ rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
+ if (rc < 0) {
+ mlog_errno(rc);
+ break;
+ }
+ zero_pos = next_pos;
+
+ /*
+ * Very large extends have the potential to lock up
+ * the cpu for extended periods of time.
+ */
+ cond_resched();
+ }
+
+ return rc;
+}
+
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to_size)
{
int ret = 0;
- u64 start_off;
+ u64 zero_start, range_start = 0, range_end = 0;
struct super_block *sb = inode->i_sb;
- start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
- while (start_off < zero_to_size) {
- ret = ocfs2_write_zero_page(inode, start_off);
- if (ret < 0) {
+ zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+ trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)zero_start,
+ (unsigned long long)i_size_read(inode));
+ while (zero_start < zero_to_size) {
+ ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
+ zero_to_size,
+ &range_start,
+ &range_end);
+ if (ret) {
mlog_errno(ret);
- goto out;
+ break;
}
+ if (!range_end)
+ break;
+ /* Trim the ends */
+ if (range_start < zero_start)
+ range_start = zero_start;
+ if (range_end > zero_to_size)
+ range_end = zero_to_size;
+
+ ret = ocfs2_zero_extend_range(inode, range_start,
+ range_end, di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ zero_start = range_end;
+ }
+
+ return ret;
+}
- start_off += sb->s_blocksize;
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to)
+{
+ int ret;
+ u32 clusters_to_add;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ /*
+ * Only quota files call this without a bh, and they can't be
+ * refcounted.
+ */
+ BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
+
+ clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
+ if (clusters_to_add < oi->ip_clusters)
+ clusters_to_add = 0;
+ else
+ clusters_to_add -= oi->ip_clusters;
+
+ if (clusters_to_add) {
+ ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
+ clusters_to_add, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
}
+ /*
+ * Call this even if we don't add any clusters to the tree. We
+ * still need to zero the area between the old i_size and the
+ * new i_size.
+ */
+ ret = ocfs2_zero_extend(inode, di_bh, zero_to);
+ if (ret < 0)
+ mlog_errno(ret);
+
out:
return ret;
}
-/*
- * A tail_to_skip value > 0 indicates that we're being called from
- * ocfs2_file_aio_write(). This has the following implications:
- *
- * - we don't want to update i_size
- * - di_bh will be NULL, which is fine because it's only used in the
- * case where we want to update i_size.
- * - ocfs2_zero_extend() will then only be filling the hole created
- * between i_size and the start of the write.
- */
static int ocfs2_extend_file(struct inode *inode,
struct buffer_head *di_bh,
- u64 new_i_size,
- size_t tail_to_skip)
+ u64 new_i_size)
{
int ret = 0;
- u32 clusters_to_add;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
- BUG_ON(!tail_to_skip && !di_bh);
+ BUG_ON(!di_bh);
/* setattr sometimes calls us like this. */
if (new_i_size == 0)
goto out;
if (i_size_read(inode) == new_i_size)
- goto out;
+ goto out;
BUG_ON(new_i_size < i_size_read(inode));
- clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
- OCFS2_I(inode)->ip_clusters;
+ /*
+ * The alloc sem blocks people in read/write from reading our
+ * allocation until we're done changing it. We depend on
+ * i_mutex to block other extend/truncate calls while we're
+ * here. We even have to hold it for sparse files because there
+ * might be some tail zeroing.
+ */
+ down_write(&oi->ip_alloc_sem);
- if (clusters_to_add) {
- /*
- * protect the pages that ocfs2_zero_extend is going to
- * be pulling into the page cache.. we do this before the
- * metadata extend so that we don't get into the situation
- * where we've extended the metadata but can't get the data
- * lock to zero.
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ /*
+ * We can optimize small extends by keeping the inodes
+ * inline data.
*/
- ret = ocfs2_data_lock(inode, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
+ if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
+ up_write(&oi->ip_alloc_sem);
+ goto out_update_size;
}
- ret = ocfs2_extend_allocation(inode, clusters_to_add);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_unlock;
- }
-
- ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
- if (ret < 0) {
+ ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
+ if (ret) {
+ up_write(&oi->ip_alloc_sem);
mlog_errno(ret);
- goto out_unlock;
+ goto out;
}
}
- if (!tail_to_skip) {
- /* We're being called from ocfs2_setattr() which wants
- * us to update i_size */
- ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
- if (ret < 0)
- mlog_errno(ret);
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
+ else
+ ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
+ new_i_size);
+
+ up_write(&oi->ip_alloc_sem);
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
}
-out_unlock:
- if (clusters_to_add) /* this is the only case in which we lock */
- ocfs2_data_unlock(inode, 1);
+out_update_size:
+ ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+ if (ret < 0)
+ mlog_errno(ret);
out:
return ret;
@@ -778,33 +1124,32 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
struct super_block *sb = inode->i_sb;
struct ocfs2_super *osb = OCFS2_SB(sb);
struct buffer_head *bh = NULL;
- struct ocfs2_journal_handle *handle = NULL;
-
- mlog_entry("(0x%p, '%.*s')\n", dentry,
- dentry->d_name.len, dentry->d_name.name);
-
- if (attr->ia_valid & ATTR_MODE)
- mlog(0, "mode change: %d\n", attr->ia_mode);
- if (attr->ia_valid & ATTR_UID)
- mlog(0, "uid change: %d\n", attr->ia_uid);
- if (attr->ia_valid & ATTR_GID)
- mlog(0, "gid change: %d\n", attr->ia_gid);
- if (attr->ia_valid & ATTR_SIZE)
- mlog(0, "size change...\n");
- if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
- mlog(0, "time change...\n");
+ handle_t *handle = NULL;
+ struct dquot *transfer_to[MAXQUOTAS] = { };
+ int qtype;
+
+ trace_ocfs2_setattr(inode, dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ dentry->d_name.len, dentry->d_name.name,
+ attr->ia_valid, attr->ia_mode,
+ from_kuid(&init_user_ns, attr->ia_uid),
+ from_kgid(&init_user_ns, attr->ia_gid));
+
+ /* ensuring we don't even attempt to truncate a symlink */
+ if (S_ISLNK(inode->i_mode))
+ attr->ia_valid &= ~ATTR_SIZE;
#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
| ATTR_GID | ATTR_UID | ATTR_MODE)
- if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
- mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
+ if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
return 0;
- }
status = inode_change_ok(inode, attr);
if (status)
return status;
+ if (is_quota_modification(inode, attr))
+ dquot_initialize(inode);
size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
if (size_change) {
status = ocfs2_rw_lock(inode, 1);
@@ -814,18 +1159,30 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+ status = ocfs2_inode_lock(inode, &bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
goto bail_unlock_rw;
}
- if (size_change && attr->ia_size != i_size_read(inode)) {
- if (i_size_read(inode) > attr->ia_size)
+ if (size_change) {
+ status = inode_newsize_ok(inode, attr->ia_size);
+ if (status)
+ goto bail_unlock;
+
+ inode_dio_wait(inode);
+
+ if (i_size_read(inode) >= attr->ia_size) {
+ if (ocfs2_should_order_data(inode)) {
+ status = ocfs2_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (status)
+ goto bail_unlock;
+ }
status = ocfs2_truncate_file(inode, bh, attr->ia_size);
- else
- status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);
+ } else
+ status = ocfs2_extend_file(inode, bh, attr->ia_size);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -834,35 +1191,77 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- mlog_errno(status);
- goto bail_unlock;
+ if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
+ /*
+ * Gather pointers to quota structures so that allocation /
+ * freeing of quota structures happens here and not inside
+ * dquot_transfer() where we have problems with lock ordering
+ */
+ if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
+ && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+ transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
+ if (!transfer_to[USRQUOTA]) {
+ status = -ESRCH;
+ goto bail_unlock;
+ }
+ }
+ if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
+ && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+ transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
+ if (!transfer_to[GRPQUOTA]) {
+ status = -ESRCH;
+ goto bail_unlock;
+ }
+ }
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
+ 2 * ocfs2_quota_trans_credits(sb));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto bail_unlock;
+ }
+ status = __dquot_transfer(inode, transfer_to);
+ if (status < 0)
+ goto bail_commit;
+ } else {
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto bail_unlock;
+ }
}
- status = inode_setattr(inode, attr);
- if (status < 0) {
- mlog_errno(status);
- goto bail_commit;
- }
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
status = ocfs2_mark_inode_dirty(handle, inode, bh);
if (status < 0)
mlog_errno(status);
bail_commit:
- ocfs2_commit_trans(handle);
+ ocfs2_commit_trans(osb, handle);
bail_unlock:
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
+
+ /* Release quota pointers in case we acquired them */
+ for (qtype = 0; qtype < MAXQUOTAS; qtype++)
+ dqput(transfer_to[qtype]);
+
+ if (!status && attr->ia_valid & ATTR_MODE) {
+ status = posix_acl_chmod(inode, inode->i_mode);
+ if (status < 0)
+ mlog_errno(status);
+ }
- mlog_exit(status);
return status;
}
@@ -875,8 +1274,6 @@ int ocfs2_getattr(struct vfsmount *mnt,
struct ocfs2_super *osb = sb->s_fs_info;
int err;
- mlog_entry_void();
-
err = ocfs2_inode_revalidate(dentry);
if (err) {
if (err != -ENOENT)
@@ -890,127 +1287,836 @@ int ocfs2_getattr(struct vfsmount *mnt,
stat->blksize = osb->s_clustersize;
bail:
- mlog_exit(err);
-
return err;
}
-static int ocfs2_write_remove_suid(struct inode *inode)
+int ocfs2_permission(struct inode *inode, int mask)
{
int ret;
- struct buffer_head *bh = NULL;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_journal_handle *handle;
+
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (ret) {
+ if (ret != -ENOENT)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = generic_permission(inode, mask);
+
+ ocfs2_inode_unlock(inode, 0);
+out:
+ return ret;
+}
+
+static int __ocfs2_write_remove_suid(struct inode *inode,
+ struct buffer_head *bh)
+{
+ int ret;
+ handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di;
- mlog_entry("(Inode %llu, mode 0%o)\n",
- (unsigned long long)oi->ip_blkno, inode->i_mode);
+ trace_ocfs2_write_remove_suid(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ inode->i_mode);
- handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
- if (handle == NULL) {
- ret = -ENOMEM;
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
mlog_errno(ret);
goto out;
}
- ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_trans;
}
- ret = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_bh;
- }
-
inode->i_mode &= ~S_ISUID;
if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
inode->i_mode &= ~S_ISGID;
di = (struct ocfs2_dinode *) bh->b_data;
di->i_mode = cpu_to_le16(inode->i_mode);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret < 0)
+ ocfs2_journal_dirty(handle, bh);
+
+out_trans:
+ ocfs2_commit_trans(osb, handle);
+out:
+ return ret;
+}
+
+/*
+ * Will look for holes and unwritten extents in the range starting at
+ * pos for count bytes (inclusive).
+ */
+static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
+ size_t count)
+{
+ int ret = 0;
+ unsigned int extent_flags;
+ u32 cpos, clusters, extent_len, phys_cpos;
+ struct super_block *sb = inode->i_sb;
+
+ cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+ clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+ while (clusters) {
+ ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+ &extent_flags);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
+ ret = 1;
+ break;
+ }
+
+ if (extent_len > clusters)
+ extent_len = clusters;
+
+ clusters -= extent_len;
+ cpos += extent_len;
+ }
+out:
+ return ret;
+}
+
+static int ocfs2_write_remove_suid(struct inode *inode)
+{
+ int ret;
+ struct buffer_head *bh = NULL;
+
+ ret = ocfs2_read_inode_block(inode, &bh);
+ if (ret < 0) {
mlog_errno(ret);
-out_bh:
+ goto out;
+ }
+
+ ret = __ocfs2_write_remove_suid(inode, bh);
+out:
brelse(bh);
-out_trans:
- ocfs2_commit_trans(handle);
+ return ret;
+}
+
+/*
+ * Allocate enough extents to cover the region starting at byte offset
+ * start for len bytes. Existing extents are skipped, any extents
+ * added are marked as "unwritten".
+ */
+static int ocfs2_allocate_unwritten_extents(struct inode *inode,
+ u64 start, u64 len)
+{
+ int ret;
+ u32 cpos, phys_cpos, clusters, alloc_size;
+ u64 end = start + len;
+ struct buffer_head *di_bh = NULL;
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Nothing to do if the requested reservation range
+ * fits within the inode.
+ */
+ if (ocfs2_size_fits_inline_data(di_bh, end))
+ goto out;
+
+ ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /*
+ * We consider both start and len to be inclusive.
+ */
+ cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
+ clusters -= cpos;
+
+ while (clusters) {
+ ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
+ &alloc_size, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Hole or existing extent len can be arbitrary, so
+ * cap it to our own allocation request.
+ */
+ if (alloc_size > clusters)
+ alloc_size = clusters;
+
+ if (phys_cpos) {
+ /*
+ * We already have an allocation at this
+ * region so we can safely skip it.
+ */
+ goto next;
+ }
+
+ ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
+ if (ret) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+next:
+ cpos += alloc_size;
+ clusters -= alloc_size;
+ }
+
+ ret = 0;
out:
- mlog_exit(ret);
+
+ brelse(di_bh);
return ret;
}
-static inline int ocfs2_write_should_remove_suid(struct inode *inode)
+/*
+ * Truncate a byte range, avoiding pages within partial clusters. This
+ * preserves those pages for the zeroing code to write to.
+ */
+static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
+ u64 byte_len)
{
- mode_t mode = inode->i_mode;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ loff_t start, end;
+ struct address_space *mapping = inode->i_mapping;
- if (!capable(CAP_FSETID)) {
- if (unlikely(mode & S_ISUID))
- return 1;
+ start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
+ end = byte_start + byte_len;
+ end = end & ~(osb->s_clustersize - 1);
- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
- return 1;
+ if (start < end) {
+ unmap_mapping_range(mapping, start, end - start, 0);
+ truncate_inode_pages_range(mapping, start, end - 1);
}
- return 0;
}
-static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
- const char __user *buf,
- size_t count,
- loff_t pos)
+static int ocfs2_zero_partial_clusters(struct inode *inode,
+ u64 start, u64 len)
{
- struct iovec local_iov = { .iov_base = (void __user *)buf,
- .iov_len = count };
- int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
- u32 clusters;
- struct file *filp = iocb->ki_filp;
- struct inode *inode = filp->f_dentry->d_inode;
- loff_t newsize, saved_pos;
+ int ret = 0;
+ u64 tmpend, end = start + len;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ unsigned int csize = osb->s_clustersize;
+ handle_t *handle;
- mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
- (unsigned int)count,
- filp->f_dentry->d_name.len,
- filp->f_dentry->d_name.name);
+ /*
+ * The "start" and "end" values are NOT necessarily part of
+ * the range whose allocation is being deleted. Rather, this
+ * is what the user passed in with the request. We must zero
+ * partial clusters here. There's no need to worry about
+ * physical allocation - the zeroing code knows to skip holes.
+ */
+ trace_ocfs2_zero_partial_clusters(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)start, (unsigned long long)end);
+
+ /*
+ * If both edges are on a cluster boundary then there's no
+ * zeroing required as the region is part of the allocation to
+ * be truncated.
+ */
+ if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
+ goto out;
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We want to get the byte offset of the end of the 1st cluster.
+ */
+ tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
+ if (tmpend > end)
+ tmpend = end;
+
+ trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
+ (unsigned long long)tmpend);
+
+ ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
+ if (ret)
+ mlog_errno(ret);
+
+ if (tmpend < end) {
+ /*
+ * This may make start and end equal, but the zeroing
+ * code will skip any work in that case so there's no
+ * need to catch it up here.
+ */
+ start = end & ~(osb->s_clustersize - 1);
+
+ trace_ocfs2_zero_partial_clusters_range2(
+ (unsigned long long)start, (unsigned long long)end);
+
+ ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
+ if (ret)
+ mlog_errno(ret);
+ }
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+
+ ocfs2_commit_trans(osb, handle);
+out:
+ return ret;
+}
+
+static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
+{
+ int i;
+ struct ocfs2_extent_rec *rec = NULL;
+
+ for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+
+ rec = &el->l_recs[i];
+
+ if (le32_to_cpu(rec->e_cpos) < pos)
+ break;
+ }
+
+ return i;
+}
+
+/*
+ * Helper to calculate the punching pos and length in one run, we handle the
+ * following three cases in order:
+ *
+ * - remove the entire record
+ * - remove a partial record
+ * - no record needs to be removed (hole-punching completed)
+*/
+static void ocfs2_calc_trunc_pos(struct inode *inode,
+ struct ocfs2_extent_list *el,
+ struct ocfs2_extent_rec *rec,
+ u32 trunc_start, u32 *trunc_cpos,
+ u32 *trunc_len, u32 *trunc_end,
+ u64 *blkno, int *done)
+{
+ int ret = 0;
+ u32 coff, range;
+
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+ if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
+ /*
+ * remove an entire extent record.
+ */
+ *trunc_cpos = le32_to_cpu(rec->e_cpos);
+ /*
+ * Skip holes if any.
+ */
+ if (range < *trunc_end)
+ *trunc_end = range;
+ *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
+ *blkno = le64_to_cpu(rec->e_blkno);
+ *trunc_end = le32_to_cpu(rec->e_cpos);
+ } else if (range > trunc_start) {
+ /*
+ * remove a partial extent record, which means we're
+ * removing the last extent record.
+ */
+ *trunc_cpos = trunc_start;
+ /*
+ * skip hole if any.
+ */
+ if (range < *trunc_end)
+ *trunc_end = range;
+ *trunc_len = *trunc_end - trunc_start;
+ coff = trunc_start - le32_to_cpu(rec->e_cpos);
+ *blkno = le64_to_cpu(rec->e_blkno) +
+ ocfs2_clusters_to_blocks(inode->i_sb, coff);
+ *trunc_end = trunc_start;
+ } else {
+ /*
+ * It may have two following possibilities:
+ *
+ * - last record has been removed
+ * - trunc_start was within a hole
+ *
+ * both two cases mean the completion of hole punching.
+ */
+ ret = 1;
+ }
+
+ *done = ret;
+}
+
+static int ocfs2_remove_inode_range(struct inode *inode,
+ struct buffer_head *di_bh, u64 byte_start,
+ u64 byte_len)
+{
+ int ret = 0, flags = 0, done = 0, i;
+ u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
+ u32 cluster_in_el;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct address_space *mapping = inode->i_mapping;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_list *el = NULL;
+ struct ocfs2_extent_rec *rec = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
+
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ trace_ocfs2_remove_inode_range(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)byte_start,
+ (unsigned long long)byte_len);
- /* happy write of zero bytes */
- if (count == 0)
+ if (byte_len == 0)
return 0;
- if (!inode) {
- mlog(0, "bad inode\n");
- return -EIO;
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
+ byte_start + byte_len, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ /*
+ * There's no need to get fancy with the page cache
+ * truncate of an inline-data inode. We're talking
+ * about less than a page here, which will be cached
+ * in the dinode buffer anyway.
+ */
+ unmap_mapping_range(mapping, 0, 0, 0);
+ truncate_inode_pages(mapping, 0);
+ goto out;
+ }
+
+ /*
+ * For reflinks, we may need to CoW 2 clusters which might be
+ * partially zero'd later, if hole's start and end offset were
+ * within one cluster(means is not exactly aligned to clustersize).
+ */
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+
+ ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
+ trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
+ cluster_in_el = trunc_end;
+
+ ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ path = ocfs2_new_path_from_et(&et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
}
+ while (trunc_end > trunc_start) {
+
+ ret = ocfs2_find_path(INODE_CACHE(inode), path,
+ cluster_in_el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+
+ i = ocfs2_find_rec(el, trunc_end);
+ /*
+ * Need to go to previous extent block.
+ */
+ if (i < 0) {
+ if (path->p_tree_depth == 0)
+ break;
+
+ ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+ path,
+ &cluster_in_el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We've reached the leftmost extent block,
+ * it's safe to leave.
+ */
+ if (cluster_in_el == 0)
+ break;
+
+ /*
+ * The 'pos' searched for previous extent block is
+ * always one cluster less than actual trunc_end.
+ */
+ trunc_end = cluster_in_el + 1;
+
+ ocfs2_reinit_path(path, 1);
+
+ continue;
+
+ } else
+ rec = &el->l_recs[i];
+
+ ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
+ &trunc_len, &trunc_end, &blkno, &done);
+ if (done)
+ break;
+
+ flags = rec->e_flags;
+ phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+ ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+ phys_cpos, trunc_len, flags,
+ &dealloc, refcount_loc);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cluster_in_el = trunc_end;
+
+ ocfs2_reinit_path(path, 1);
+ }
+
+ ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
+
+out:
+ ocfs2_free_path(path);
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+
+ return ret;
+}
+
+/*
+ * Parts of this function taken from xfs_change_file_space()
+ */
+static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
+ loff_t f_pos, unsigned int cmd,
+ struct ocfs2_space_resv *sr,
+ int change_size)
+{
+ int ret;
+ s64 llen;
+ loff_t size;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *di_bh = NULL;
+ handle_t *handle;
+ unsigned long long max_off = inode->i_sb->s_maxbytes;
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
mutex_lock(&inode->i_mutex);
- /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
- if (filp->f_flags & O_DIRECT) {
- have_alloc_sem = 1;
- down_read(&inode->i_alloc_sem);
+
+ /*
+ * This prevents concurrent writes on other nodes
+ */
+ ret = ocfs2_rw_lock(inode, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
- /* concurrent O_DIRECT writes are allowed */
- rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
- ret = ocfs2_rw_lock(inode, rw_level);
- if (ret < 0) {
- rw_level = -1;
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_rw_unlock;
+ }
+
+ if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+ ret = -EPERM;
+ goto out_inode_unlock;
+ }
+
+ switch (sr->l_whence) {
+ case 0: /*SEEK_SET*/
+ break;
+ case 1: /*SEEK_CUR*/
+ sr->l_start += f_pos;
+ break;
+ case 2: /*SEEK_END*/
+ sr->l_start += i_size_read(inode);
+ break;
+ default:
+ ret = -EINVAL;
+ goto out_inode_unlock;
+ }
+ sr->l_whence = 0;
+
+ llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
+
+ if (sr->l_start < 0
+ || sr->l_start > max_off
+ || (sr->l_start + llen) < 0
+ || (sr->l_start + llen) > max_off) {
+ ret = -EINVAL;
+ goto out_inode_unlock;
+ }
+ size = sr->l_start + sr->l_len;
+
+ if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
+ cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
+ if (sr->l_len <= 0) {
+ ret = -EINVAL;
+ goto out_inode_unlock;
+ }
+ }
+
+ if (file && should_remove_suid(file->f_path.dentry)) {
+ ret = __ocfs2_write_remove_suid(inode, di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_inode_unlock;
+ }
+ }
+
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ switch (cmd) {
+ case OCFS2_IOC_RESVSP:
+ case OCFS2_IOC_RESVSP64:
+ /*
+ * This takes unsigned offsets, but the signed ones we
+ * pass have been checked against overflow above.
+ */
+ ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
+ sr->l_len);
+ break;
+ case OCFS2_IOC_UNRESVSP:
+ case OCFS2_IOC_UNRESVSP64:
+ ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
+ sr->l_len);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_inode_unlock;
+ }
+
+ /*
+ * We update c/mtime for these changes
+ */
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_inode_unlock;
+ }
+
+ if (change_size && i_size_read(inode) < size)
+ i_size_write(inode, size);
+
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+
+ if (file && (file->f_flags & O_SYNC))
+ handle->h_sync = 1;
+
+ ocfs2_commit_trans(osb, handle);
+
+out_inode_unlock:
+ brelse(di_bh);
+ ocfs2_inode_unlock(inode, 1);
+out_rw_unlock:
+ ocfs2_rw_unlock(inode, 1);
+
+out:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
+int ocfs2_change_file_space(struct file *file, unsigned int cmd,
+ struct ocfs2_space_resv *sr)
+{
+ struct inode *inode = file_inode(file);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int ret;
+
+ if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
+ !ocfs2_writes_unwritten_extents(osb))
+ return -ENOTTY;
+ else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
+ !ocfs2_sparse_alloc(osb))
+ return -ENOTTY;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+ ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_space_resv sr;
+ int change_size = 1;
+ int cmd = OCFS2_IOC_RESVSP64;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+ if (!ocfs2_writes_unwritten_extents(osb))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ change_size = 0;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ cmd = OCFS2_IOC_UNRESVSP64;
+
+ sr.l_whence = 0;
+ sr.l_start = (s64)offset;
+ sr.l_len = (s64)len;
+
+ return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
+ change_size);
+}
+
+int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
+ size_t count)
+{
+ int ret = 0;
+ unsigned int extent_flags;
+ u32 cpos, clusters, extent_len, phys_cpos;
+ struct super_block *sb = inode->i_sb;
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
+ !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
+ OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+ clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+ while (clusters) {
+ ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+ &extent_flags);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
+ ret = 1;
+ break;
+ }
+
+ if (extent_len > clusters)
+ extent_len = clusters;
+
+ clusters -= extent_len;
+ cpos += extent_len;
+ }
+out:
+ return ret;
+}
+
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+ int blockmask = inode->i_sb->s_blocksize - 1;
+ loff_t final_size = pos + count;
+
+ if ((pos & blockmask) || (final_size & blockmask))
+ return 1;
+ return 0;
+}
+
+static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+ struct file *file,
+ loff_t pos, size_t count,
+ int *meta_level)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+ u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 clusters =
+ ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
mlog_errno(ret);
goto out;
}
- /*
- * We sample i_size under a read level meta lock to see if our write
- * is extending the file, if it is we back off and get a write level
- * meta lock.
+ *meta_level = 1;
+
+ ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
+ if (ret)
+ mlog_errno(ret);
+out:
+ brelse(di_bh);
+ return ret;
+}
+
+static int ocfs2_prepare_inode_for_write(struct file *file,
+ loff_t *ppos,
+ size_t count,
+ int appending,
+ int *direct_io,
+ int *has_refcount)
+{
+ int ret = 0, meta_level = 0;
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ loff_t saved_pos = 0, end;
+
+ /*
+ * We start with a read level meta lock and only jump to an ex
+ * if we need to make modifications here.
*/
- meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
for(;;) {
- ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
+ ret = ocfs2_inode_lock(inode, NULL, meta_level);
if (ret < 0) {
meta_level = -1;
mlog_errno(ret);
@@ -1022,13 +2128,13 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
* remove_suid() calls ->setattr without any hint that
* we may have already done our cluster locking. Since
* ocfs2_setattr() *must* take cluster locks to
- * proceeed, this will lead us to recursively lock the
+ * proceed, this will lead us to recursively lock the
* inode. There's also the dinode i_size state which
* can be lost via setattr during extending writes (we
* set inode->i_size at the end of a write. */
- if (ocfs2_write_should_remove_suid(inode)) {
+ if (should_remove_suid(dentry)) {
if (meta_level == 0) {
- ocfs2_meta_unlock(inode, meta_level);
+ ocfs2_inode_unlock(inode, meta_level);
meta_level = 1;
continue;
}
@@ -1036,114 +2142,337 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
ret = ocfs2_write_remove_suid(inode);
if (ret < 0) {
mlog_errno(ret);
- goto out;
+ goto out_unlock;
}
}
/* work on a copy of ppos until we're sure that we won't have
* to recalculate it due to relocking. */
- if (filp->f_flags & O_APPEND) {
+ if (appending)
saved_pos = i_size_read(inode);
- mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
- } else {
- saved_pos = iocb->ki_pos;
+ else
+ saved_pos = *ppos;
+
+ end = saved_pos + count;
+
+ ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
+ if (ret == 1) {
+ ocfs2_inode_unlock(inode, meta_level);
+ meta_level = -1;
+
+ ret = ocfs2_prepare_inode_for_refcount(inode,
+ file,
+ saved_pos,
+ count,
+ &meta_level);
+ if (has_refcount)
+ *has_refcount = 1;
+ if (direct_io)
+ *direct_io = 0;
}
- newsize = count + saved_pos;
- mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
- (long long) saved_pos, (long long) newsize,
- (long long) i_size_read(inode));
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
- /* No need for a higher level metadata lock if we're
- * never going past i_size. */
- if (newsize <= i_size_read(inode))
+ /*
+ * Skip the O_DIRECT checks if we don't need
+ * them.
+ */
+ if (!direct_io || !(*direct_io))
break;
- if (meta_level == 0) {
- ocfs2_meta_unlock(inode, meta_level);
- meta_level = 1;
- continue;
+ /*
+ * There's no sane way to do direct writes to an inode
+ * with inline data.
+ */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ *direct_io = 0;
+ break;
}
- spin_lock(&OCFS2_I(inode)->ip_lock);
- clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
- OCFS2_I(inode)->ip_clusters;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
+ /*
+ * Allowing concurrent direct writes means
+ * i_size changes wouldn't be synchronized, so
+ * one node could wind up truncating another
+ * nodes writes.
+ */
+ if (end > i_size_read(inode)) {
+ *direct_io = 0;
+ break;
+ }
- mlog(0, "Writing at EOF, may need more allocation: "
- "i_size = %lld, newsize = %lld, need %u clusters\n",
- (long long) i_size_read(inode), (long long) newsize,
- clusters);
+ /*
+ * We don't fill holes during direct io, so
+ * check for them here. If any are found, the
+ * caller will have to retake some cluster
+ * locks and initiate the io as buffered.
+ */
+ ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
+ if (ret == 1) {
+ *direct_io = 0;
+ ret = 0;
+ } else if (ret < 0)
+ mlog_errno(ret);
+ break;
+ }
- /* We only want to continue the rest of this loop if
- * our extend will actually require more
- * allocation. */
- if (!clusters)
- break;
+ if (appending)
+ *ppos = saved_pos;
+
+out_unlock:
+ trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
+ saved_pos, appending, count,
+ direct_io, has_refcount);
+
+ if (meta_level >= 0)
+ ocfs2_inode_unlock(inode, meta_level);
+
+out:
+ return ret;
+}
+
+static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
+ int can_do_direct, has_refcount = 0;
+ ssize_t written = 0;
+ size_t count = iov_iter_count(from);
+ loff_t old_size, *ppos = &iocb->ki_pos;
+ u32 old_clusters;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int full_coherency = !(osb->s_mount_opt &
+ OCFS2_MOUNT_COHERENCY_BUFFERED);
+ int unaligned_dio = 0;
+
+ trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name,
+ (unsigned int)from->nr_segs); /* GRRRRR */
+
+ if (iocb->ki_nbytes == 0)
+ return 0;
+
+ appending = file->f_flags & O_APPEND ? 1 : 0;
+ direct_io = file->f_flags & O_DIRECT ? 1 : 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ ocfs2_iocb_clear_sem_locked(iocb);
+
+relock:
+ /* to match setattr's i_mutex -> rw_lock ordering */
+ if (direct_io) {
+ have_alloc_sem = 1;
+ /* communicate with ocfs2_dio_end_io */
+ ocfs2_iocb_set_sem_locked(iocb);
+ }
- ret = ocfs2_extend_file(inode, NULL, newsize, count);
+ /*
+ * Concurrent O_DIRECT writes are allowed with
+ * mount_option "coherency=buffered".
+ */
+ rw_level = (!direct_io || full_coherency);
+
+ ret = ocfs2_rw_lock(inode, rw_level);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_sems;
+ }
+
+ /*
+ * O_DIRECT writes with "coherency=full" need to take EX cluster
+ * inode_lock to guarantee coherency.
+ */
+ if (direct_io && full_coherency) {
+ /*
+ * We need to take and drop the inode lock to force
+ * other nodes to drop their caches. Buffered I/O
+ * already does this in write_begin().
+ */
+ ret = ocfs2_inode_lock(inode, NULL, 1);
if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
+ mlog_errno(ret);
goto out;
}
- break;
+
+ ocfs2_inode_unlock(inode, 1);
}
- /* ok, we're done with i_size and alloc work */
- iocb->ki_pos = saved_pos;
- ocfs2_meta_unlock(inode, meta_level);
- meta_level = -1;
+ can_do_direct = direct_io;
+ ret = ocfs2_prepare_inode_for_write(file, ppos,
+ iocb->ki_nbytes, appending,
+ &can_do_direct, &has_refcount);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
- /* communicate with ocfs2_dio_end_io */
- ocfs2_iocb_set_rw_locked(iocb);
+ if (direct_io && !is_sync_kiocb(iocb))
+ unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes,
+ *ppos);
+
+ /*
+ * We can't complete the direct I/O as requested, fall back to
+ * buffered I/O.
+ */
+ if (direct_io && !can_do_direct) {
+ ocfs2_rw_unlock(inode, rw_level);
+
+ have_alloc_sem = 0;
+ rw_level = -1;
+
+ direct_io = 0;
+ goto relock;
+ }
+
+ if (unaligned_dio) {
+ /*
+ * Wait on previous unaligned aio to complete before
+ * proceeding.
+ */
+ mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
+ /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
+ ocfs2_iocb_set_unaligned_aio(iocb);
+ }
- ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
+ /*
+ * To later detect whether a journal commit for sync writes is
+ * necessary, we sample i_size, and cluster count here.
+ */
+ old_size = i_size_read(inode);
+ old_clusters = OCFS2_I(inode)->ip_clusters;
+ /* communicate with ocfs2_dio_end_io */
+ ocfs2_iocb_set_rw_locked(iocb, rw_level);
+
+ ret = generic_write_checks(file, ppos, &count,
+ S_ISBLK(inode->i_mode));
+ if (ret)
+ goto out_dio;
+
+ iov_iter_truncate(from, count);
+ if (direct_io) {
+ written = generic_file_direct_write(iocb, from, *ppos);
+ if (written < 0) {
+ ret = written;
+ goto out_dio;
+ }
+ } else {
+ current->backing_dev_info = file->f_mapping->backing_dev_info;
+ written = generic_perform_write(file, from, *ppos);
+ if (likely(written >= 0))
+ iocb->ki_pos = *ppos + written;
+ current->backing_dev_info = NULL;
+ }
+
+out_dio:
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+ BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
- /*
+ if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
+ ((file->f_flags & O_DIRECT) && !direct_io)) {
+ ret = filemap_fdatawrite_range(file->f_mapping, *ppos,
+ *ppos + count - 1);
+ if (ret < 0)
+ written = ret;
+
+ if (!ret && ((old_size != i_size_read(inode)) ||
+ (old_clusters != OCFS2_I(inode)->ip_clusters) ||
+ has_refcount)) {
+ ret = jbd2_journal_force_commit(osb->journal->j_journal);
+ if (ret < 0)
+ written = ret;
+ }
+
+ if (!ret)
+ ret = filemap_fdatawait_range(file->f_mapping, *ppos,
+ *ppos + count - 1);
+ }
+
+ /*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
* function pointer which is called when o_direct io completes so that
- * it can unlock our rw lock. (it's the clustered equivalent of
- * i_alloc_sem; protects truncate from racing with pending ios).
+ * it can unlock our rw lock.
* Unfortunately there are error cases which call end_io and others
* that don't. so we don't have to unlock the rw_lock if either an
* async dio is going to do it in the future or an end_io after an
* error has already done it.
*/
- if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+ if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
have_alloc_sem = 0;
+ unaligned_dio = 0;
+ }
+
+ if (unaligned_dio) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
+ mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
out:
- if (meta_level != -1)
- ocfs2_meta_unlock(inode, meta_level);
- if (have_alloc_sem)
- up_read(&inode->i_alloc_sem);
- if (rw_level != -1)
+ if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
+
+out_sems:
+ if (have_alloc_sem)
+ ocfs2_iocb_clear_sem_locked(iocb);
+
mutex_unlock(&inode->i_mutex);
- mlog_exit(ret);
+ if (written)
+ ret = written;
+ return ret;
+}
+
+static ssize_t ocfs2_file_splice_read(struct file *in,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len,
+ unsigned int flags)
+{
+ int ret = 0, lock_level = 0;
+ struct inode *inode = file_inode(in);
+
+ trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ in->f_path.dentry->d_name.len,
+ in->f_path.dentry->d_name.name, len);
+
+ /*
+ * See the comment in ocfs2_file_read_iter()
+ */
+ ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto bail;
+ }
+ ocfs2_inode_unlock(inode, lock_level);
+
+ ret = generic_file_splice_read(in, ppos, pipe, len, flags);
+
+bail:
return ret;
}
-static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
- char __user *buf,
- size_t count,
- loff_t pos)
+static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
+ struct iov_iter *to)
{
- int ret = 0, rw_level = -1, have_alloc_sem = 0;
+ int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
struct file *filp = iocb->ki_filp;
- struct inode *inode = filp->f_dentry->d_inode;
+ struct inode *inode = file_inode(filp);
+
+ trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ filp->f_path.dentry->d_name.len,
+ filp->f_path.dentry->d_name.name,
+ to->nr_segs); /* GRRRRR */
- mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
- (unsigned int)count,
- filp->f_dentry->d_name.len,
- filp->f_dentry->d_name.name);
if (!inode) {
ret = -EINVAL;
@@ -1151,13 +2480,15 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
goto bail;
}
- /*
+ ocfs2_iocb_clear_sem_locked(iocb);
+
+ /*
* buffered reads protect themselves in ->readpage(). O_DIRECT reads
* need locks to protect pending reads from racing with truncate.
*/
if (filp->f_flags & O_DIRECT) {
- down_read(&inode->i_alloc_sem);
have_alloc_sem = 1;
+ ocfs2_iocb_set_sem_locked(iocb);
ret = ocfs2_rw_lock(inode, 0);
if (ret < 0) {
@@ -1166,33 +2497,32 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
}
rw_level = 0;
/* communicate with ocfs2_dio_end_io */
- ocfs2_iocb_set_rw_locked(iocb);
+ ocfs2_iocb_set_rw_locked(iocb, rw_level);
}
/*
* We're fine letting folks race truncates and extending
* writes with read across the cluster, just like they can
* locally. Hence no rw_lock during read.
- *
+ *
* Take and drop the meta data lock to update inode fields
* like i_size. This allows the checks down below
- * generic_file_aio_read() a chance of actually working.
+ * generic_file_aio_read() a chance of actually working.
*/
- ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
+ ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto bail;
}
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, lock_level);
- ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
- if (ret == -EINVAL)
- mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
+ ret = generic_file_read_iter(iocb, to);
+ trace_generic_file_aio_read_ret(ret);
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
- /* see ocfs2_file_aio_write */
+ /* see ocfs2_file_write_iter */
if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
rw_level = -1;
have_alloc_sem = 0;
@@ -1200,40 +2530,167 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
bail:
if (have_alloc_sem)
- up_read(&inode->i_alloc_sem);
- if (rw_level != -1)
+ ocfs2_iocb_clear_sem_locked(iocb);
+
+ if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
- mlog_exit(ret);
return ret;
}
-struct inode_operations ocfs2_file_iops = {
+/* Refer generic_file_llseek_unlocked() */
+static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ switch (whence) {
+ case SEEK_SET:
+ break;
+ case SEEK_END:
+ /* SEEK_END requires the OCFS2 inode lock for the file
+ * because it references the file's size.
+ */
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ offset += i_size_read(inode);
+ ocfs2_inode_unlock(inode, 0);
+ break;
+ case SEEK_CUR:
+ if (offset == 0) {
+ offset = file->f_pos;
+ goto out;
+ }
+ offset += file->f_pos;
+ break;
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
+ if (ret)
+ goto out;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+
+out:
+ mutex_unlock(&inode->i_mutex);
+ if (ret)
+ return ret;
+ return offset;
+}
+
+const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
+ .permission = ocfs2_permission,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ocfs2_listxattr,
+ .removexattr = generic_removexattr,
+ .fiemap = ocfs2_fiemap,
+ .get_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
};
-struct inode_operations ocfs2_special_file_iops = {
+const struct inode_operations ocfs2_special_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
+ .permission = ocfs2_permission,
+ .get_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
};
+/*
+ * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
+ * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
+ */
const struct file_operations ocfs2_fops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .sendfile = generic_file_sendfile,
+ .llseek = ocfs2_file_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
.mmap = ocfs2_mmap,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
- .aio_read = ocfs2_file_aio_read,
- .aio_write = ocfs2_file_aio_write,
- .ioctl = ocfs2_ioctl,
+ .read_iter = ocfs2_file_read_iter,
+ .write_iter = ocfs2_file_write_iter,
+ .unlocked_ioctl = ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ocfs2_compat_ioctl,
+#endif
+ .lock = ocfs2_lock,
+ .flock = ocfs2_flock,
+ .splice_read = ocfs2_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .fallocate = ocfs2_fallocate,
};
const struct file_operations ocfs2_dops = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate = ocfs2_readdir,
+ .fsync = ocfs2_sync_file,
+ .release = ocfs2_dir_release,
+ .open = ocfs2_dir_open,
+ .unlocked_ioctl = ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ocfs2_compat_ioctl,
+#endif
+ .lock = ocfs2_lock,
+ .flock = ocfs2_flock,
+};
+
+/*
+ * POSIX-lockless variants of our file_operations.
+ *
+ * These will be used if the underlying cluster stack does not support
+ * posix file locking, if the user passes the "localflocks" mount
+ * option, or if we have a local-only fs.
+ *
+ * ocfs2_flock is in here because all stacks handle UNIX file locks,
+ * so we still want it in the case of no stack support for
+ * plocks. Internally, it will do the right thing when asked to ignore
+ * the cluster.
+ */
+const struct file_operations ocfs2_fops_no_plocks = {
+ .llseek = ocfs2_file_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .mmap = ocfs2_mmap,
+ .fsync = ocfs2_sync_file,
+ .release = ocfs2_file_release,
+ .open = ocfs2_file_open,
+ .read_iter = ocfs2_file_read_iter,
+ .write_iter = ocfs2_file_write_iter,
+ .unlocked_ioctl = ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ocfs2_compat_ioctl,
+#endif
+ .flock = ocfs2_flock,
+ .splice_read = ocfs2_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .fallocate = ocfs2_fallocate,
+};
+
+const struct file_operations ocfs2_dops_no_plocks = {
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = ocfs2_readdir,
+ .iterate = ocfs2_readdir,
.fsync = ocfs2_sync_file,
- .ioctl = ocfs2_ioctl,
+ .release = ocfs2_dir_release,
+ .open = ocfs2_dir_open,
+ .unlocked_ioctl = ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ocfs2_compat_ioctl,
+#endif
+ .flock = ocfs2_flock,
};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 740c9e7ca59..97bf761c9e7 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -28,30 +28,49 @@
extern const struct file_operations ocfs2_fops;
extern const struct file_operations ocfs2_dops;
-extern struct inode_operations ocfs2_file_iops;
-extern struct inode_operations ocfs2_special_file_iops;
+extern const struct file_operations ocfs2_fops_no_plocks;
+extern const struct file_operations ocfs2_dops_no_plocks;
+extern const struct inode_operations ocfs2_file_iops;
+extern const struct inode_operations ocfs2_special_file_iops;
struct ocfs2_alloc_context;
+enum ocfs2_alloc_restarted;
-enum ocfs2_alloc_restarted {
- RESTART_NONE = 0,
- RESTART_TRANS,
- RESTART_META
+struct ocfs2_file_private {
+ struct file *fp_file;
+ struct mutex fp_mutex;
+ struct ocfs2_lock_res fp_flock;
};
-int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
- struct inode *inode,
- u32 clusters_to_add,
- struct buffer_head *fe_bh,
- struct ocfs2_journal_handle *handle,
- struct ocfs2_alloc_context *data_ac,
- struct ocfs2_alloc_context *meta_ac,
- enum ocfs2_alloc_restarted *reason);
+
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+ struct inode *inode,
+ u32 *logical_offset,
+ u32 clusters_to_add,
+ int mark_unwritten,
+ struct buffer_head *fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_simple_size_update(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 new_i_size);
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to);
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
+int ocfs2_permission(struct inode *inode, int mask);
-int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
- struct inode *inode,
- struct buffer_head *fe_bh,
- u64 new_i_size);
+int ocfs2_should_update_atime(struct inode *inode,
+ struct vfsmount *vfsmnt);
+int ocfs2_update_inode_atime(struct inode *inode,
+ struct buffer_head *bh);
+
+int ocfs2_change_file_space(struct file *file, unsigned int cmd,
+ struct ocfs2_space_resv *sr);
+int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
+ size_t count);
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index cbfd45a97a6..d8208b20dc5 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,16 +26,8 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/kmod.h>
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
-
-#include <dlm/dlmapi.h>
-
-#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -44,179 +36,49 @@
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
-#include "vote.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
-#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
-#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
-
static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
int bit);
static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
int bit);
-static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
-static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
- struct ocfs2_node_map *from);
-static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
- struct ocfs2_node_map *from);
+
+/* special case -1 for now
+ * TODO: should *really* make sure the calling func never passes -1!! */
+static void ocfs2_node_map_init(struct ocfs2_node_map *map)
+{
+ map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
+ memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
+ sizeof(unsigned long));
+}
void ocfs2_init_node_maps(struct ocfs2_super *osb)
{
spin_lock_init(&osb->node_map_lock);
- ocfs2_node_map_init(&osb->mounted_map);
- ocfs2_node_map_init(&osb->recovery_map);
- ocfs2_node_map_init(&osb->umount_map);
ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
}
-static void ocfs2_do_node_down(int node_num,
- struct ocfs2_super *osb)
+void ocfs2_do_node_down(int node_num, void *data)
{
+ struct ocfs2_super *osb = data;
+
BUG_ON(osb->node_num == node_num);
- mlog(0, "ocfs2: node down event for %d\n", node_num);
+ trace_ocfs2_do_node_down(node_num);
- if (!osb->dlm) {
+ if (!osb->cconn) {
/*
- * No DLM means we're not even ready to participate yet.
- * We check the slots after the DLM comes up, so we will
- * notice the node death then. We can safely ignore it
- * here.
+ * No cluster connection means we're not even ready to
+ * participate yet. We check the slots after the cluster
+ * comes up, so we will notice the node death then. We
+ * can safely ignore it here.
*/
return;
}
- if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
- /* If a node is in the umount map, then we've been
- * expecting him to go down and we know ahead of time
- * that recovery is not necessary. */
- ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
- return;
- }
-
ocfs2_recovery_thread(osb, node_num);
-
- ocfs2_remove_node_from_vote_queues(osb, node_num);
-}
-
-static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
- int node_num,
- void *data)
-{
- ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
-}
-
-/* Called from the dlm when it's about to evict a node. We may also
- * get a heartbeat callback later. */
-static void ocfs2_dlm_eviction_cb(int node_num,
- void *data)
-{
- struct ocfs2_super *osb = (struct ocfs2_super *) data;
- struct super_block *sb = osb->sb;
-
- mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
- MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
-
- ocfs2_do_node_down(node_num, osb);
-}
-
-static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
- int node_num,
- void *data)
-{
- struct ocfs2_super *osb = data;
-
- BUG_ON(osb->node_num == node_num);
-
- mlog(0, "node up event for %d\n", node_num);
- ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
-}
-
-void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
-{
- o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
- ocfs2_hb_node_down_cb, osb,
- OCFS2_HB_NODE_DOWN_PRI);
-
- o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
- ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
-
- /* Not exactly a heartbeat callback, but leads to essentially
- * the same path so we set it up here. */
- dlm_setup_eviction_cb(&osb->osb_eviction_cb,
- ocfs2_dlm_eviction_cb,
- osb);
-}
-
-/* Most functions here are just stubs for now... */
-int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
-{
- int status;
-
- status = o2hb_register_callback(&osb->osb_hb_down);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- status = o2hb_register_callback(&osb->osb_hb_up);
- if (status < 0)
- mlog_errno(status);
-
-bail:
- return status;
-}
-
-void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
-{
- int status;
-
- status = o2hb_unregister_callback(&osb->osb_hb_down);
- if (status < 0)
- mlog_errno(status);
-
- status = o2hb_unregister_callback(&osb->osb_hb_up);
- if (status < 0)
- mlog_errno(status);
-}
-
-void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
-{
- int ret;
- char *argv[5], *envp[3];
-
- if (!osb->uuid_str) {
- /* This can happen if we don't get far enough in mount... */
- mlog(0, "No UUID with which to stop heartbeat!\n\n");
- return;
- }
-
- argv[0] = (char *)o2nm_get_hb_ctl_path();
- argv[1] = "-K";
- argv[2] = "-u";
- argv[3] = osb->uuid_str;
- argv[4] = NULL;
-
- mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
-
- /* minimal command environment taken from cpu_run_sbin_hotplug */
- envp[0] = "HOME=/";
- envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[2] = NULL;
-
- ret = call_usermodehelper(argv[0], argv, envp, 1);
- if (ret < 0)
- mlog_errno(ret);
-}
-
-/* special case -1 for now
- * TODO: should *really* make sure the calling func never passes -1!! */
-void ocfs2_node_map_init(struct ocfs2_node_map *map)
-{
- map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
- memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
- sizeof(unsigned long));
}
static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
@@ -270,110 +132,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
return ret;
}
-static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
-{
- int bit;
- bit = find_next_bit(map->map, map->num_nodes, 0);
- if (bit < map->num_nodes)
- return 0;
- return 1;
-}
-
-int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
- struct ocfs2_node_map *map)
-{
- int ret;
- BUG_ON(map->num_nodes == 0);
- spin_lock(&osb->node_map_lock);
- ret = __ocfs2_node_map_is_empty(map);
- spin_unlock(&osb->node_map_lock);
- return ret;
-}
-
-static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
- struct ocfs2_node_map *from)
-{
- BUG_ON(from->num_nodes == 0);
- ocfs2_node_map_init(target);
- __ocfs2_node_map_set(target, from);
-}
-
-/* returns 1 if bit is the only bit set in target, 0 otherwise */
-int ocfs2_node_map_is_only(struct ocfs2_super *osb,
- struct ocfs2_node_map *target,
- int bit)
-{
- struct ocfs2_node_map temp;
- int ret;
-
- spin_lock(&osb->node_map_lock);
- __ocfs2_node_map_dup(&temp, target);
- __ocfs2_node_map_clear_bit(&temp, bit);
- ret = __ocfs2_node_map_is_empty(&temp);
- spin_unlock(&osb->node_map_lock);
-
- return ret;
-}
-
-static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
- struct ocfs2_node_map *from)
-{
- int num_longs, i;
-
- BUG_ON(target->num_nodes != from->num_nodes);
- BUG_ON(target->num_nodes == 0);
-
- num_longs = BITS_TO_LONGS(target->num_nodes);
- for (i = 0; i < num_longs; i++)
- target->map[i] = from->map[i];
-}
-
-/* Returns whether the recovery bit was actually set - it may not be
- * if a node is still marked as needing recovery */
-int ocfs2_recovery_map_set(struct ocfs2_super *osb,
- int num)
-{
- int set = 0;
-
- spin_lock(&osb->node_map_lock);
-
- __ocfs2_node_map_clear_bit(&osb->mounted_map, num);
-
- if (!test_bit(num, osb->recovery_map.map)) {
- __ocfs2_node_map_set_bit(&osb->recovery_map, num);
- set = 1;
- }
-
- spin_unlock(&osb->node_map_lock);
-
- return set;
-}
-
-void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
- int num)
-{
- ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
-}
-
-int ocfs2_node_map_iterate(struct ocfs2_super *osb,
- struct ocfs2_node_map *map,
- int idx)
-{
- int i = idx;
-
- idx = O2NM_INVALID_NODE_NUM;
- spin_lock(&osb->node_map_lock);
- if ((i != O2NM_INVALID_NODE_NUM) &&
- (i >= 0) &&
- (i < map->num_nodes)) {
- while(i < map->num_nodes) {
- if (test_bit(i, map->map)) {
- idx = i;
- break;
- }
- i++;
- }
- }
- spin_unlock(&osb->node_map_lock);
- return idx;
-}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index e8fb079122e..74b9c5dda28 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -28,16 +28,10 @@
void ocfs2_init_node_maps(struct ocfs2_super *osb);
-void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
-int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
-void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
-void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
+void ocfs2_do_node_down(int node_num, void *data);
/* node map functions - used to keep track of mounted and in-recovery
* nodes. */
-void ocfs2_node_map_init(struct ocfs2_node_map *map);
-int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
- struct ocfs2_node_map *map);
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit);
@@ -47,21 +41,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit);
-int ocfs2_node_map_iterate(struct ocfs2_super *osb,
- struct ocfs2_node_map *map,
- int idx);
-static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
- struct ocfs2_node_map *map)
-{
- return ocfs2_node_map_iterate(osb, map, 0);
-}
-int ocfs2_recovery_map_set(struct ocfs2_super *osb,
- int num);
-void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
- int num);
-/* returns 1 if bit is the only bit set in target, 0 otherwise */
-int ocfs2_node_map_is_only(struct ocfs2_super *osb,
- struct ocfs2_node_map *target,
- int bit);
#endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 16e8e74dc96..437de7f768c 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,19 +25,19 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
+#include <linux/quotaops.h>
#include <asm/byteorder.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
+#include "dir.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
@@ -50,7 +50,9 @@
#include "symlink.h"
#include "sysfile.h"
#include "uptodate.h"
-#include "vote.h"
+#include "xattr.h"
+#include "refcounttree.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -59,8 +61,11 @@ struct ocfs2_find_inode_args
u64 fi_blkno;
unsigned long fi_ino;
unsigned int fi_flags;
+ unsigned int fi_sysfile_type;
};
+static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
+
static int ocfs2_read_locked_inode(struct inode *inode,
struct ocfs2_find_inode_args *args);
static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@@ -89,31 +94,46 @@ void ocfs2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_DIRSYNC;
}
-struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
- u64 blkno,
- int delete_vote)
+/* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */
+void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
{
- struct ocfs2_find_inode_args args;
+ unsigned int flags = oi->vfs_inode.i_flags;
+
+ oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL|
+ OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL);
+ if (flags & S_SYNC)
+ oi->ip_attr |= OCFS2_SYNC_FL;
+ if (flags & S_APPEND)
+ oi->ip_attr |= OCFS2_APPEND_FL;
+ if (flags & S_IMMUTABLE)
+ oi->ip_attr |= OCFS2_IMMUTABLE_FL;
+ if (flags & S_NOATIME)
+ oi->ip_attr |= OCFS2_NOATIME_FL;
+ if (flags & S_DIRSYNC)
+ oi->ip_attr |= OCFS2_DIRSYNC_FL;
+}
- /* ocfs2_ilookup_for_vote should *only* be called from the
- * vote thread */
- BUG_ON(current != osb->vote_task);
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
+{
+ struct ocfs2_find_inode_args args;
args.fi_blkno = blkno;
- args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
- if (delete_vote)
- args.fi_flags |= OCFS2_FI_FLAG_DELETE;
- args.fi_ino = ino_from_blkno(osb->sb, blkno);
- return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
-}
+ args.fi_flags = 0;
+ args.fi_ino = ino_from_blkno(sb, blkno);
+ args.fi_sysfile_type = 0;
-struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
+ return ilookup5(sb, blkno, ocfs2_find_actor, &args);
+}
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
+ int sysfile_type)
{
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
+ journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
- mlog_entry("(blkno = %llu)\n", (unsigned long long)blkno);
+ trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
+ sysfile_type);
/* Ok. By now we've either got the offsets passed to us by the
* caller, or we just pulled them off the bh. Lets do some
@@ -127,36 +147,60 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
args.fi_blkno = blkno;
args.fi_flags = flags;
args.fi_ino = ino_from_blkno(sb, blkno);
+ args.fi_sysfile_type = sysfile_type;
inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
ocfs2_init_locked_inode, &args);
/* inode was *not* in the inode cache. 2.6.x requires
* us to do our own read_inode call and unlock it
* afterwards. */
- if (inode && inode->i_state & I_NEW) {
- mlog(0, "Inode was not in inode cache, reading it.\n");
- ocfs2_read_locked_inode(inode, &args);
- unlock_new_inode(inode);
- }
if (inode == NULL) {
inode = ERR_PTR(-ENOMEM);
mlog_errno(PTR_ERR(inode));
goto bail;
}
+ trace_ocfs2_iget5_locked(inode->i_state);
+ if (inode->i_state & I_NEW) {
+ ocfs2_read_locked_inode(inode, &args);
+ unlock_new_inode(inode);
+ }
if (is_bad_inode(inode)) {
iput(inode);
inode = ERR_PTR(-ESTALE);
- mlog_errno(PTR_ERR(inode));
goto bail;
}
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ transaction_t *transaction;
+ tid_t tid;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ read_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ read_unlock(&journal->j_state_lock);
+ oi->i_sync_tid = tid;
+ oi->i_datasync_tid = tid;
+ }
+
bail:
if (!IS_ERR(inode)) {
- mlog(0, "returning inode with number %llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- mlog_exit_ptr(inode);
- } else
- mlog_errno(PTR_ERR(inode));
+ trace_ocfs2_iget_end(inode,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ }
return inode;
}
@@ -175,40 +219,17 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
int ret = 0;
- mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
-
args = opaque;
mlog_bug_on_msg(!inode, "No inode in find actor!\n");
- if (oi->ip_blkno != args->fi_blkno)
- goto bail;
+ trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno);
- /* OCFS2_FI_FLAG_NOWAIT is *only* set from
- * ocfs2_ilookup_for_vote which won't create an inode for one
- * that isn't found. The vote thread which doesn't want to get
- * an inode which is in the process of going away - otherwise
- * the call to __wait_on_freeing_inode in find_inode_fast will
- * cause it to deadlock on an inode which may be waiting on a
- * vote (or lock release) in delete_inode */
- if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
- (inode->i_state & (I_FREEING|I_CLEAR))) {
- /* As stated above, we're not going to return an
- * inode. In the case of a delete vote, the voting
- * code is going to signal the other node to go
- * ahead. Mark that state here, so this freeing inode
- * has the state when it gets to delete_inode. */
- if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
- spin_lock(&oi->ip_lock);
- ocfs2_mark_inode_remotely_deleted(inode);
- spin_unlock(&oi->ip_lock);
- }
+ if (oi->ip_blkno != args->fi_blkno)
goto bail;
- }
ret = 1;
bail:
- mlog_exit(ret);
return ret;
}
@@ -220,63 +241,72 @@ bail:
static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
{
struct ocfs2_find_inode_args *args = opaque;
-
- mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
+ static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
+ ocfs2_file_ip_alloc_sem_key;
inode->i_ino = args->fi_ino;
OCFS2_I(inode)->ip_blkno = args->fi_blkno;
+ if (args->fi_sysfile_type != 0)
+ lockdep_set_class(&inode->i_mutex,
+ &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
+ if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE ||
+ args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE ||
+ args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
+ args->fi_sysfile_type == LOCAL_GROUP_QUOTA_SYSTEM_INODE)
+ lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
+ &ocfs2_quota_ip_alloc_sem_key);
+ else
+ lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
+ &ocfs2_file_ip_alloc_sem_key);
- mlog_exit(0);
return 0;
}
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
- int create_ino)
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+ int create_ino)
{
struct super_block *sb;
struct ocfs2_super *osb;
- int status = -EINVAL;
-
- mlog_entry("(0x%p, size:%llu)\n", inode,
- (unsigned long long)fe->i_size);
+ int use_plocks = 1;
sb = inode->i_sb;
osb = OCFS2_SB(sb);
- /* this means that read_inode cannot create a superblock inode
- * today. change if needed. */
- if (!OCFS2_IS_VALID_DINODE(fe) ||
- !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
- mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%llu, "
- "signature = %.*s, flags = 0x%x\n",
- inode->i_ino,
- (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
- fe->i_signature, le32_to_cpu(fe->i_flags));
- goto bail;
- }
+ if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
+ ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
+ use_plocks = 0;
- if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
- mlog(ML_ERROR, "file entry generation does not match "
- "superblock! osb->fs_generation=%x, "
- "fe->i_fs_generation=%x\n",
- osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
- goto bail;
- }
+ /*
+ * These have all been checked by ocfs2_read_inode_block() or set
+ * by ocfs2_mknod_locked(), so a failure is a code bug.
+ */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode
+ cannot create a superblock
+ inode today. change if
+ that is needed. */
+ BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)));
+ BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation);
+
+
+ OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+ OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
+ OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
inode->i_version = 1;
inode->i_generation = le32_to_cpu(fe->i_generation);
inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
inode->i_mode = le16_to_cpu(fe->i_mode);
- inode->i_uid = le32_to_cpu(fe->i_uid);
- inode->i_gid = le32_to_cpu(fe->i_gid);
+ i_uid_write(inode, le32_to_cpu(fe->i_uid));
+ i_gid_write(inode, le32_to_cpu(fe->i_gid));
/* Fast symlinks will have i_size but no allocated clusters. */
- if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
+ if (S_ISLNK(inode->i_mode) && !fe->i_clusters) {
inode->i_blocks = 0;
- else
- inode->i_blocks =
- ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
- inode->i_mapping->a_ops = &ocfs2_aops;
+ inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
+ } else {
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ inode->i_mapping->a_ops = &ocfs2_aops;
+ }
inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
@@ -288,24 +318,24 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
mlog(ML_ERROR,
"ip_blkno %llu != i_blkno %llu!\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)fe->i_blkno);
-
- OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
- OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
+ (unsigned long long)le64_to_cpu(fe->i_blkno));
- inode->i_nlink = le16_to_cpu(fe->i_links_count);
+ set_nlink(inode, ocfs2_read_links_count(fe));
- if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
+ trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno,
+ le32_to_cpu(fe->i_flags));
+ if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
-
+ inode->i_flags |= S_NOQUOTA;
+ }
+
if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
- mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+ } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
+ inode->i_flags |= S_NOQUOTA;
} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
- mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
/* we can't actually hit this as read_inode can't
* handle superblocks today ;-) */
BUG();
@@ -313,20 +343,24 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
- inode->i_fop = &ocfs2_fops;
+ if (use_plocks)
+ inode->i_fop = &ocfs2_fops;
+ else
+ inode->i_fop = &ocfs2_fops_no_plocks;
inode->i_op = &ocfs2_file_iops;
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
case S_IFDIR:
inode->i_op = &ocfs2_dir_iops;
- inode->i_fop = &ocfs2_dops;
+ if (use_plocks)
+ inode->i_fop = &ocfs2_dops;
+ else
+ inode->i_fop = &ocfs2_dops_no_plocks;
i_size_write(inode, le64_to_cpu(fe->i_size));
+ OCFS2_I(inode)->ip_dir_lock_gen = 1;
break;
case S_IFLNK:
- if (ocfs2_inode_is_fast_symlink(inode))
- inode->i_op = &ocfs2_fast_symlink_inode_operations;
- else
- inode->i_op = &ocfs2_symlink_inode_operations;
+ inode->i_op = &ocfs2_symlink_inode_operations;
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
default:
@@ -345,27 +379,27 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
* the generation argument to
* ocfs2_inode_lock_res_init() will have to change.
*/
- BUG_ON(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL));
+ BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL);
- ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
OCFS2_LOCK_TYPE_META, 0, inode);
+
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+ OCFS2_LOCK_TYPE_OPEN, 0, inode);
}
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
OCFS2_LOCK_TYPE_RW, inode->i_generation,
inode);
- ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
- OCFS2_LOCK_TYPE_DATA, inode->i_generation,
- inode);
-
ocfs2_set_inode_flags(inode);
- inode->i_flags |= S_NOATIME;
- status = 0;
-bail:
- mlog_exit(status);
- return status;
+ OCFS2_I(inode)->ip_last_used_slot = 0;
+ OCFS2_I(inode)->ip_last_used_group = 0;
+
+ if (S_ISDIR(inode->i_mode))
+ ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
+ OCFS2_RESV_FLAG_DIR);
}
static int ocfs2_read_locked_inode(struct inode *inode,
@@ -378,22 +412,10 @@ static int ocfs2_read_locked_inode(struct inode *inode,
int status, can_lock;
u32 generation = 0;
- mlog_entry("(0x%p, 0x%p)\n", inode, args);
-
status = -EINVAL;
- if (inode == NULL || inode->i_sb == NULL) {
- mlog(ML_ERROR, "bad inode\n");
- return status;
- }
sb = inode->i_sb;
osb = OCFS2_SB(sb);
- if (!args) {
- mlog(ML_ERROR, "bad inode args\n");
- make_bad_inode(inode);
- return status;
- }
-
/*
* To improve performance of cold-cache inode stats, we take
* the cluster lock here if possible.
@@ -418,13 +440,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
* #1 and #2 can be simply solved by never taking the lock
* here for system files (which are the only type we read
* during mount). It's a heavier approach, but our main
- * concern is user-accesible files anyway.
+ * concern is user-accessible files anyway.
*
* #3 works itself out because we'll eventually take the
* cluster lock before trusting anything anyway.
*/
can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
- && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK);
+ && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
+ && !ocfs2_mount_local(osb);
+
+ trace_ocfs2_read_locked_inode(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock);
/*
* To maintain backwards compatibility with older versions of
@@ -436,12 +462,22 @@ static int ocfs2_read_locked_inode(struct inode *inode,
if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
generation = osb->fs_generation;
- ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres,
OCFS2_LOCK_TYPE_META,
generation, inode);
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+ OCFS2_LOCK_TYPE_OPEN,
+ 0, inode);
+
if (can_lock) {
- status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+ status = ocfs2_open_lock(inode);
+ if (status) {
+ make_bad_inode(inode);
+ mlog_errno(status);
+ return status;
+ }
+ status = ocfs2_inode_lock(inode, NULL, 0);
if (status) {
make_bad_inode(inode);
mlog_errno(status);
@@ -449,8 +485,26 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
}
- status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
- can_lock ? inode : NULL);
+ if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
+ status = ocfs2_try_open_lock(inode, 0);
+ if (status) {
+ make_bad_inode(inode);
+ return status;
+ }
+ }
+
+ if (can_lock) {
+ status = ocfs2_read_inode_block_full(inode, &bh,
+ OCFS2_BH_IGNORE_CACHE);
+ } else {
+ status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
+ /*
+ * If buffer is in jbd, then its checksum may not have been
+ * computed as yet.
+ */
+ if (!status && !buffer_jbd(bh))
+ status = ocfs2_validate_inode_block(osb->sb, bh);
+ }
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -458,11 +512,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
status = -EINVAL;
fe = (struct ocfs2_dinode *) bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
- (unsigned long long)fe->i_blkno, 7, fe->i_signature);
- goto bail;
- }
/*
* This is a code bug. Right now the caller needs to
@@ -476,13 +525,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
S_ISBLK(le16_to_cpu(fe->i_mode)))
- inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+ inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
- if (ocfs2_populate_inode(inode, fe, 0) < 0) {
- mlog(ML_ERROR, "populate failed! i_blkno=%llu, i_ino=%lu\n",
- (unsigned long long)fe->i_blkno, inode->i_ino);
- goto bail;
- }
+ ocfs2_populate_inode(inode, fe, 0);
BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
@@ -490,7 +535,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
bail:
if (can_lock)
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
if (status < 0)
make_bad_inode(inode);
@@ -498,7 +543,6 @@ bail:
if (args && bh)
brelse(bh);
- mlog_exit(status);
return status;
}
@@ -512,51 +556,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct buffer_head *fe_bh)
{
int status = 0;
- struct ocfs2_journal_handle *handle = NULL;
- struct ocfs2_truncate_context *tc = NULL;
struct ocfs2_dinode *fe;
-
- mlog_entry_void();
+ handle_t *handle = NULL;
fe = (struct ocfs2_dinode *) fe_bh->b_data;
- /* zero allocation, zero truncate :) */
- if (!fe->i_clusters)
- goto bail;
+ /*
+ * This check will also skip truncate of inodes with inline
+ * data and fast symlinks.
+ */
+ if (fe->i_clusters) {
+ if (ocfs2_should_order_data(inode))
+ ocfs2_begin_ordered_truncate(inode, 0);
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(status);
+ goto out;
+ }
- handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- handle = NULL;
- mlog_errno(status);
- goto bail;
- }
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
- status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ i_size_write(inode, 0);
- ocfs2_commit_trans(handle);
- handle = NULL;
+ status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
- status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_commit_trans(osb, handle);
+ handle = NULL;
- status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ status = ocfs2_commit_truncate(osb, inode, fe_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
}
-bail:
- if (handle)
- ocfs2_commit_trans(handle);
- mlog_exit(status);
+out:
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
return status;
}
@@ -568,7 +617,7 @@ static int ocfs2_remove_inode(struct inode *inode,
int status;
struct inode *inode_alloc_inode = NULL;
struct buffer_head *inode_alloc_bh = NULL;
- struct ocfs2_journal_handle *handle;
+ handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
@@ -582,7 +631,7 @@ static int ocfs2_remove_inode(struct inode *inode,
}
mutex_lock(&inode_alloc_inode->i_mutex);
- status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1);
+ status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
if (status < 0) {
mutex_unlock(&inode_alloc_inode->i_mutex);
@@ -590,38 +639,37 @@ static int ocfs2_remove_inode(struct inode *inode,
goto bail;
}
- handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS);
+ handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
+ ocfs2_quota_trans_credits(inode->i_sb));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto bail_unlock;
}
- status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
- orphan_dir_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail_commit;
+ if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+ status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+ orphan_dir_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_commit;
+ }
}
/* set the inodes dtime */
- status = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail_commit;
}
di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
- le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
-
- status = ocfs2_journal_dirty(handle, di_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail_commit;
- }
+ di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
+ ocfs2_journal_dirty(handle, di_bh);
- ocfs2_remove_from_cache(inode, di_bh);
+ ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
+ dquot_free_inode(inode);
status = ocfs2_free_dinode(handle, inode_alloc_inode,
inode_alloc_bh, di);
@@ -629,9 +677,9 @@ static int ocfs2_remove_inode(struct inode *inode,
mlog_errno(status);
bail_commit:
- ocfs2_commit_trans(handle);
+ ocfs2_commit_trans(osb, handle);
bail_unlock:
- ocfs2_meta_unlock(inode_alloc_inode, 1);
+ ocfs2_inode_unlock(inode_alloc_inode, 1);
mutex_unlock(&inode_alloc_inode->i_mutex);
brelse(inode_alloc_bh);
bail:
@@ -640,7 +688,7 @@ bail:
return status;
}
-/*
+/*
* Serialize with orphan dir recovery. If the process doing
* recovery on this orphan dir does an iget() with the dir
* i_mutex held, we'll deadlock here. Instead we detect this
@@ -653,8 +701,6 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
spin_lock(&osb->osb_lock);
if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) {
- mlog(0, "Recovery is happening on orphan dir %d, will skip "
- "this inode\n", slot);
ret = -EDEADLK;
goto out;
}
@@ -663,6 +709,7 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
osb->osb_orphan_wipes[slot]++;
out:
spin_unlock(&osb->osb_lock);
+ trace_ocfs2_check_orphan_recovery_state(slot, ret);
return ret;
}
@@ -679,43 +726,44 @@ static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
static int ocfs2_wipe_inode(struct inode *inode,
struct buffer_head *di_bh)
{
- int status, orphaned_slot;
+ int status, orphaned_slot = -1;
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
- /* We've already voted on this so it should be readonly - no
- * spinlock needed. */
- orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+ if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+ orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
- status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
- if (status)
- return status;
+ status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
+ if (status)
+ return status;
- orphan_dir_inode = ocfs2_get_system_file_inode(osb,
- ORPHAN_DIR_SYSTEM_INODE,
- orphaned_slot);
- if (!orphan_dir_inode) {
- status = -EEXIST;
- mlog_errno(status);
- goto bail;
- }
+ orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+ ORPHAN_DIR_SYSTEM_INODE,
+ orphaned_slot);
+ if (!orphan_dir_inode) {
+ status = -EEXIST;
+ mlog_errno(status);
+ goto bail;
+ }
- /* Lock the orphan dir. The lock will be held for the entire
- * delete_inode operation. We do this now to avoid races with
- * recovery completion on other nodes. */
- mutex_lock(&orphan_dir_inode->i_mutex);
- status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1);
- if (status < 0) {
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ /* Lock the orphan dir. The lock will be held for the entire
+ * delete_inode operation. We do this now to avoid races with
+ * recovery completion on other nodes. */
+ mutex_lock(&orphan_dir_inode->i_mutex);
+ status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ if (status < 0) {
+ mutex_unlock(&orphan_dir_inode->i_mutex);
- mlog_errno(status);
- goto bail;
+ mlog_errno(status);
+ goto bail;
+ }
}
/* we do this while holding the orphan dir lock because we
- * don't want recovery being run from another node to vote for
- * an inode delete on us -- this will result in two nodes
+ * don't want recovery being run from another node to try an
+ * inode delete underneath us -- this will result in two nodes
* truncating the same file! */
status = ocfs2_truncate_for_delete(osb, inode, di_bh);
if (status < 0) {
@@ -723,13 +771,38 @@ static int ocfs2_wipe_inode(struct inode *inode,
goto bail_unlock_dir;
}
+ /* Remove any dir index tree */
+ if (S_ISDIR(inode->i_mode)) {
+ status = ocfs2_dx_dir_truncate(inode, di_bh);
+ if (status) {
+ mlog_errno(status);
+ goto bail_unlock_dir;
+ }
+ }
+
+ /*Free extended attribute resources associated with this inode.*/
+ status = ocfs2_xattr_remove(inode, di_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_unlock_dir;
+ }
+
+ status = ocfs2_remove_refcount_tree(inode, di_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_unlock_dir;
+ }
+
status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
orphan_dir_bh);
if (status < 0)
mlog_errno(status);
bail_unlock_dir:
- ocfs2_meta_unlock(orphan_dir_inode, 1);
+ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)
+ return status;
+
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
mutex_unlock(&orphan_dir_inode->i_mutex);
brelse(orphan_dir_bh);
bail:
@@ -740,13 +813,17 @@ bail:
}
/* There is a series of simple checks that should be done before a
- * vote is even considered. Encapsulate those in this function. */
+ * trylock is even considered. Encapsulate those in this function. */
static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
{
int ret = 0;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task,
+ (unsigned long long)oi->ip_blkno,
+ oi->ip_flags);
+
/* We shouldn't be getting here for the root directory
* inode.. */
if (inode == osb->root_inode) {
@@ -754,16 +831,15 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail;
}
- /* If we're coming from process_vote we can't go into our own
- * voting [hello, deadlock city!], so unforuntately we just
- * have to skip deleting this guy. That's OK though because
- * the node who's doing the actual deleting should handle it
- * anyway. */
- if (current == osb->vote_task) {
- mlog(0, "Skipping delete of %lu because we're currently "
- "in process_vote\n", inode->i_ino);
+ /*
+ * If we're coming from downconvert_thread we can't go into our own
+ * voting [hello, deadlock city!] so we cannot delete the inode. But
+ * since we dropped last inode ref when downconverting dentry lock,
+ * we cannot have the file open and thus the node doing unlink will
+ * take care of deleting the inode.
+ */
+ if (current == osb->dc_task)
goto bail;
- }
spin_lock(&oi->ip_lock);
/* OCFS2 *never* deletes system files. This should technically
@@ -775,16 +851,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail_unlock;
}
- /* If we have voted "yes" on the wipe of this inode for
- * another node, it will be marked here so we can safely skip
- * it. Recovery will cleanup any inodes we might inadvertantly
- * skip here. */
- if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
- mlog(0, "Skipping delete of %lu because another node "
- "has done this for us.\n", inode->i_ino);
- goto bail_unlock;
- }
-
ret = 1;
bail_unlock:
spin_unlock(&oi->ip_lock);
@@ -800,40 +866,53 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
struct buffer_head *di_bh,
int *wipe)
{
- int status = 0;
+ int status = 0, reason = 0;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_dinode *di;
*wipe = 0;
+ trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno,
+ inode->i_nlink);
+
/* While we were waiting for the cluster lock in
* ocfs2_delete_inode, another node might have asked to delete
* the inode. Recheck our flags to catch this. */
if (!ocfs2_inode_is_valid_to_delete(inode)) {
- mlog(0, "Skipping delete of %llu because flags changed\n",
- (unsigned long long)oi->ip_blkno);
+ reason = 1;
goto bail;
}
/* Now that we have an up to date inode, we can double check
* the link count. */
- if (inode->i_nlink) {
- mlog(0, "Skipping delete of %llu because nlink = %u\n",
- (unsigned long long)oi->ip_blkno, inode->i_nlink);
+ if (inode->i_nlink)
goto bail;
- }
/* Do some basic inode verification... */
di = (struct ocfs2_dinode *) di_bh->b_data;
- if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+ if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) &&
+ !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+ /*
+ * Inodes in the orphan dir must have ORPHANED_FL. The only
+ * inodes that come back out of the orphan dir are reflink
+ * targets. A reflink target may be moved out of the orphan
+ * dir between the time we scan the directory and the time we
+ * process it. This would lead to HAS_REFCOUNT_FL being set but
+ * ORPHANED_FL not.
+ */
+ if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
+ reason = 2;
+ goto bail;
+ }
+
/* for lack of a better error? */
status = -EEXIST;
mlog(ML_ERROR,
"Inode %llu (on-disk %llu) not orphaned! "
"Disk flags 0x%x, inode flags 0x%x\n",
(unsigned long long)oi->ip_blkno,
- (unsigned long long)di->i_blkno, di->i_flags,
- oi->ip_flags);
+ (unsigned long long)le64_to_cpu(di->i_blkno),
+ le32_to_cpu(di->i_flags), oi->ip_flags);
goto bail;
}
@@ -844,14 +923,22 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail;
}
- status = ocfs2_request_delete_vote(inode);
- /* -EBUSY means that other nodes are still using the
- * inode. We're done here though, so avoid doing anything on
- * disk and let them worry about deleting it. */
- if (status == -EBUSY) {
+ /*
+ * This is how ocfs2 determines whether an inode is still live
+ * within the cluster. Every node takes a shared read lock on
+ * the inode open lock in ocfs2_read_locked_inode(). When we
+ * get to ->delete_inode(), each node tries to convert it's
+ * lock to an exclusive. Trylocks are serialized by the inode
+ * meta data lock. If the upconvert succeeds, we know the inode
+ * is no longer live and can be deleted.
+ *
+ * Though we call this with the meta data lock held, the
+ * trylock keeps us from ABBA deadlock.
+ */
+ status = ocfs2_try_open_lock(inode, 1);
+ if (status == -EAGAIN) {
status = 0;
- mlog(0, "Skipping delete of %llu because it is in use on"
- "other nodes\n", (unsigned long long)oi->ip_blkno);
+ reason = 3;
goto bail;
}
if (status < 0) {
@@ -859,23 +946,11 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
goto bail;
}
- spin_lock(&oi->ip_lock);
- if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
- /* Nobody knew which slot this inode was orphaned
- * into. This may happen during node death and
- * recovery knows how to clean it up so we can safely
- * ignore this inode for now on. */
- mlog(0, "Nobody knew where inode %llu was orphaned!\n",
- (unsigned long long)oi->ip_blkno);
- } else {
- *wipe = 1;
-
- mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
- (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
- }
- spin_unlock(&oi->ip_lock);
+ *wipe = 1;
+ trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot));
bail:
+ trace_ocfs2_query_inode_wipe_end(status, reason);
return status;
}
@@ -885,25 +960,28 @@ bail:
static void ocfs2_cleanup_delete_inode(struct inode *inode,
int sync_data)
{
- mlog(0, "Cleanup inode %llu, sync = %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
+ trace_ocfs2_cleanup_delete_inode(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
if (sync_data)
- write_inode_now(inode, 1);
- truncate_inode_pages(&inode->i_data, 0);
+ filemap_write_and_wait(inode->i_mapping);
+ truncate_inode_pages_final(&inode->i_data);
}
-void ocfs2_delete_inode(struct inode *inode)
+static void ocfs2_delete_inode(struct inode *inode)
{
int wipe, status;
- sigset_t blocked, oldset;
+ sigset_t oldset;
struct buffer_head *di_bh = NULL;
- mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+ trace_ocfs2_delete_inode(inode->i_ino,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ is_bad_inode(inode));
- if (is_bad_inode(inode)) {
- mlog(0, "Skipping delete of bad inode\n");
+ /* When we fail in read_inode() we mark inode as bad. The second test
+ * catches the case when inode allocation fails before allocating
+ * a block for inode. */
+ if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
goto bail;
- }
if (!ocfs2_inode_is_valid_to_delete(inode)) {
/* It's probably not necessary to truncate_inode_pages
@@ -913,47 +991,54 @@ void ocfs2_delete_inode(struct inode *inode)
goto bail;
}
+ dquot_initialize(inode);
+
/* We want to block signals in delete_inode as the lock and
* messaging paths may return us -ERESTARTSYS. Which would
* cause us to exit early, resulting in inodes being orphaned
* forever. */
- sigfillset(&blocked);
- status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+ ocfs2_block_signals(&oldset);
+
+ /*
+ * Synchronize us against ocfs2_get_dentry. We take this in
+ * shared mode so that all nodes can still concurrently
+ * process deletes.
+ */
+ status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
if (status < 0) {
- mlog_errno(status);
- ocfs2_cleanup_delete_inode(inode, 1);
- goto bail;
+ mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
+ ocfs2_cleanup_delete_inode(inode, 0);
+ goto bail_unblock;
}
-
/* Lock down the inode. This gives us an up to date view of
* it's metadata (for verification), and allows us to
- * serialize delete_inode votes.
+ * serialize delete_inode on multiple nodes.
*
* Even though we might be doing a truncate, we don't take the
* allocation lock here as it won't be needed - nobody will
* have the file open.
*/
- status = ocfs2_meta_lock(inode, NULL, &di_bh, 1);
+ status = ocfs2_inode_lock(inode, &di_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
ocfs2_cleanup_delete_inode(inode, 0);
- goto bail_unblock;
+ goto bail_unlock_nfs_sync;
}
/* Query the cluster. This will be the final decision made
* before we go ahead and wipe the inode. */
status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
if (!wipe || status < 0) {
- /* Error and inode busy vote both mean we won't be
+ /* Error and remote inode busy both mean we won't be
* removing the inode, so they take almost the same
* path. */
if (status < 0)
mlog_errno(status);
- /* Someone in the cluster has voted to not wipe this
- * inode, or it was never completely orphaned. Write
- * out the pages and exit now. */
+ /* Someone in the cluster has disallowed a wipe of
+ * this inode, or it was never completely
+ * orphaned. Write out the pages and exit now. */
ocfs2_cleanup_delete_inode(inode, 1);
goto bail_unlock_inode;
}
@@ -979,38 +1064,46 @@ void ocfs2_delete_inode(struct inode *inode)
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
bail_unlock_inode:
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
+
+bail_unlock_nfs_sync:
+ ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
+
bail_unblock:
- status = sigprocmask(SIG_SETMASK, &oldset, NULL);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_unblock_signals(&oldset);
bail:
- clear_inode(inode);
- mlog_exit_void();
+ return;
}
-void ocfs2_clear_inode(struct inode *inode)
+static void ocfs2_clear_inode(struct inode *inode)
{
int status;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry_void();
-
- if (!inode)
- goto bail;
-
- mlog(0, "Clearing inode: %llu, nlink = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink);
+ clear_inode(inode);
+ trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
+ inode->i_nlink);
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino);
+ dquot_drop(inode);
+
+ /* To preven remote deletes we hold open lock before, now it
+ * is time to unlock PR and EX open locks. */
+ ocfs2_open_unlock(inode);
+
/* Do these before all the other work so that we don't bounce
- * the vote thread while waiting to destroy the locks. */
- ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
- ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
- ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+ * the downconvert thread while waiting to destroy the locks. */
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
+
+ ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
+ &oi->ip_la_data_resv);
+ ocfs2_resv_init_once(&oi->ip_la_data_resv);
/* We very well may get a clear_inode before all an inodes
* metadata has hit disk. Of course, we can't drop any cluster
@@ -1025,24 +1118,24 @@ void ocfs2_clear_inode(struct inode *inode)
"Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno);
- ocfs2_extent_map_drop(inode, 0);
- ocfs2_extent_map_init(inode);
+ ocfs2_extent_map_trunc(inode, 0);
status = ocfs2_drop_inode_locks(inode);
if (status < 0)
mlog_errno(status);
ocfs2_lock_res_free(&oi->ip_rw_lockres);
- ocfs2_lock_res_free(&oi->ip_meta_lockres);
- ocfs2_lock_res_free(&oi->ip_data_lockres);
+ ocfs2_lock_res_free(&oi->ip_inode_lockres);
+ ocfs2_lock_res_free(&oi->ip_open_lockres);
- ocfs2_metadata_cache_purge(inode);
+ ocfs2_metadata_cache_exit(INODE_CACHE(inode));
- mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
+ mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached,
"Clear inode of %llu, inode has %u cache items\n",
- (unsigned long long)oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
+ (unsigned long long)oi->ip_blkno,
+ INODE_CACHE(inode)->ci_num_cached);
- mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+ mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE),
"Clear inode of %llu, inode has a bad flag\n",
(unsigned long long)oi->ip_blkno);
@@ -1067,95 +1160,49 @@ void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(oi->ip_open_count,
"Clear inode of %llu has open count %d\n",
(unsigned long long)oi->ip_blkno, oi->ip_open_count);
- mlog_bug_on_msg(!list_empty(&oi->ip_handle_list),
- "Clear inode of %llu has non empty handle list\n",
- (unsigned long long)oi->ip_blkno);
- mlog_bug_on_msg(oi->ip_handle,
- "Clear inode of %llu has non empty handle pointer\n",
- (unsigned long long)oi->ip_blkno);
/* Clear all other flags. */
- oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
- oi->ip_created_trans = 0;
- oi->ip_last_trans = 0;
+ oi->ip_flags = 0;
oi->ip_dir_start_lookup = 0;
oi->ip_blkno = 0ULL;
-bail:
- mlog_exit_void();
+ /*
+ * ip_jinode is used to track txns against this inode. We ensure that
+ * the journal is flushed before journal shutdown. Thus it is safe to
+ * have inodes get cleaned up after journal shutdown.
+ */
+ jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
+ &oi->ip_jinode);
+}
+
+void ocfs2_evict_inode(struct inode *inode)
+{
+ if (!inode->i_nlink ||
+ (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
+ ocfs2_delete_inode(inode);
+ } else {
+ truncate_inode_pages_final(&inode->i_data);
+ }
+ ocfs2_clear_inode(inode);
}
/* Called under inode_lock, with no more references on the
* struct inode, so it's safe here to check the flags field
* and to manipulate i_nlink without any other locks. */
-void ocfs2_drop_inode(struct inode *inode)
+int ocfs2_drop_inode(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ int res;
- mlog_entry_void();
-
- mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
- (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
+ trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
+ inode->i_nlink, oi->ip_flags);
- /* Testing ip_orphaned_slot here wouldn't work because we may
- * not have gotten a delete_inode vote from any other nodes
- * yet. */
if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
- generic_delete_inode(inode);
+ res = 1;
else
- generic_drop_inode(inode);
-
- mlog_exit_void();
-}
-
-/*
- * TODO: this should probably be merged into ocfs2_get_block
- *
- * However, you now need to pay attention to the cont_prepare_write()
- * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
- * expects never to extend).
- */
-struct buffer_head *ocfs2_bread(struct inode *inode,
- int block, int *err, int reada)
-{
- struct buffer_head *bh = NULL;
- int tmperr;
- u64 p_blkno;
- int readflags = OCFS2_BH_CACHED;
-
- if (reada)
- readflags |= OCFS2_BH_READAHEAD;
-
- if (((u64)block << inode->i_sb->s_blocksize_bits) >=
- i_size_read(inode)) {
- BUG_ON(!reada);
- return NULL;
- }
-
- tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
- &p_blkno, NULL);
- if (tmperr < 0) {
- mlog_errno(tmperr);
- goto fail;
- }
-
- tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
- readflags, inode);
- if (tmperr < 0)
- goto fail;
+ res = generic_drop_inode(inode);
- tmperr = 0;
-
- *err = 0;
- return bh;
-
-fail:
- if (bh) {
- brelse(bh);
- bh = NULL;
- }
- *err = -EIO;
- return NULL;
+ return res;
}
/*
@@ -1166,11 +1213,11 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int status = 0;
- mlog_entry("(inode = 0x%p, ino = %llu)\n", inode,
- inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL);
+ trace_ocfs2_inode_revalidate(inode,
+ inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL,
+ inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0);
if (!inode) {
- mlog(0, "eep, no inode!\n");
status = -ENOENT;
goto bail;
}
@@ -1178,24 +1225,21 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
spin_lock(&OCFS2_I(inode)->ip_lock);
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
- mlog(0, "inode deleted!\n");
status = -ENOENT;
goto bail;
}
spin_unlock(&OCFS2_I(inode)->ip_lock);
- /* Let ocfs2_meta_lock do the work of updating our struct
+ /* Let ocfs2_inode_lock do the work of updating our struct
* inode for us. */
- status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+ status = ocfs2_inode_lock(inode, NULL, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
goto bail;
}
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
bail:
- mlog_exit(status);
-
return status;
}
@@ -1204,18 +1248,17 @@ bail:
* struct inode.
* Only takes ip_lock.
*/
-int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
+int ocfs2_mark_inode_dirty(handle_t *handle,
struct inode *inode,
struct buffer_head *bh)
{
int status;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
- mlog_entry("(inode %llu)\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -1223,13 +1266,15 @@ int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
spin_lock(&OCFS2_I(inode)->ip_lock);
fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+ ocfs2_get_inode_flags(OCFS2_I(inode));
fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr);
+ fe->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
spin_unlock(&OCFS2_I(inode)->ip_lock);
fe->i_size = cpu_to_le64(i_size_read(inode));
- fe->i_links_count = cpu_to_le16(inode->i_nlink);
- fe->i_uid = cpu_to_le32(inode->i_uid);
- fe->i_gid = cpu_to_le32(inode->i_gid);
+ ocfs2_set_links_count(fe, inode->i_nlink);
+ fe->i_uid = cpu_to_le32(i_uid_read(inode));
+ fe->i_gid = cpu_to_le32(i_gid_read(inode));
fe->i_mode = cpu_to_le16(inode->i_mode);
fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
@@ -1238,14 +1283,9 @@ int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0)
- mlog_errno(status);
-
- status = 0;
+ ocfs2_journal_dirty(handle, bh);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
leave:
-
- mlog_exit(status);
return status;
}
@@ -1261,16 +1301,17 @@ void ocfs2_refresh_inode(struct inode *inode,
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
+ OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
ocfs2_set_inode_flags(inode);
i_size_write(inode, le64_to_cpu(fe->i_size));
- inode->i_nlink = le16_to_cpu(fe->i_links_count);
- inode->i_uid = le32_to_cpu(fe->i_uid);
- inode->i_gid = le32_to_cpu(fe->i_gid);
+ set_nlink(inode, ocfs2_read_links_count(fe));
+ i_uid_write(inode, le32_to_cpu(fe->i_uid));
+ i_gid_write(inode, le32_to_cpu(fe->i_gid));
inode->i_mode = le16_to_cpu(fe->i_mode);
if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
inode->i_blocks = 0;
else
- inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
@@ -1280,3 +1321,141 @@ void ocfs2_refresh_inode(struct inode *inode,
spin_unlock(&OCFS2_I(inode)->ip_lock);
}
+
+int ocfs2_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+ if (rc) {
+ mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ goto bail;
+ }
+
+ /*
+ * Errors after here are fatal.
+ */
+
+ rc = -EINVAL;
+
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ di->i_signature);
+ goto bail;
+ }
+
+ if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+ ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ goto bail;
+ }
+
+ if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+ ocfs2_error(sb,
+ "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+ (unsigned long long)bh->b_blocknr);
+ goto bail;
+ }
+
+ if (le32_to_cpu(di->i_fs_generation) !=
+ OCFS2_SB(sb)->fs_generation) {
+ ocfs2_error(sb,
+ "Invalid dinode #%llu: fs_generation is %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
+ goto bail;
+ }
+
+ rc = 0;
+
+bail:
+ return rc;
+}
+
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+ int flags)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno,
+ 1, &tmp, flags, ocfs2_validate_inode_block);
+
+ /* If ocfs2_read_blocks() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
+{
+ return ocfs2_read_inode_block_full(inode, bh, 0);
+}
+
+
+static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ return oi->ip_blkno;
+}
+
+static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ return oi->vfs_inode.i_sb;
+}
+
+static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ spin_lock(&oi->ip_lock);
+}
+
+static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ spin_unlock(&oi->ip_lock);
+}
+
+static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ mutex_lock(&oi->ip_io_mutex);
+}
+
+static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ mutex_unlock(&oi->ip_io_mutex);
+}
+
+const struct ocfs2_caching_operations ocfs2_inode_caching_ops = {
+ .co_owner = ocfs2_inode_cache_owner,
+ .co_get_super = ocfs2_inode_cache_get_super,
+ .co_cache_lock = ocfs2_inode_cache_lock,
+ .co_cache_unlock = ocfs2_inode_cache_unlock,
+ .co_io_lock = ocfs2_inode_cache_io_lock,
+ .co_io_unlock = ocfs2_inode_cache_io_unlock,
+};
+
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 9957810fdf8..a6c991c0fc9 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -26,52 +26,60 @@
#ifndef OCFS2_INODE_H
#define OCFS2_INODE_H
+#include "extent_map.h"
+
/* OCFS2 Inode Private Data */
struct ocfs2_inode_info
{
u64 ip_blkno;
struct ocfs2_lock_res ip_rw_lockres;
- struct ocfs2_lock_res ip_meta_lockres;
- struct ocfs2_lock_res ip_data_lockres;
+ struct ocfs2_lock_res ip_inode_lockres;
+ struct ocfs2_lock_res ip_open_lockres;
/* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem;
+ /* protects extended attribute changes on this inode */
+ struct rw_semaphore ip_xattr_sem;
+
+ /* Number of outstanding AIO's which are not page aligned */
+ struct mutex ip_unaligned_aio;
+
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
- u32 ip_clusters;
- struct ocfs2_extent_map ip_map;
struct list_head ip_io_markers;
- int ip_orphaned_slot;
+ u32 ip_clusters;
+ u16 ip_dyn_features;
struct mutex ip_io_mutex;
-
- /* Used by the journalling code to attach an inode to a
- * handle. These are protected by ip_io_mutex in order to lock
- * out other I/O to the inode until we either commit or
- * abort. */
- struct list_head ip_handle_list;
- struct ocfs2_journal_handle *ip_handle;
-
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
+ struct ocfs2_caching_info ip_metadata_cache;
+ struct ocfs2_extent_map ip_extent_map;
+ struct inode vfs_inode;
+ struct jbd2_inode ip_jinode;
+
u32 ip_dir_start_lookup;
- /* next two are protected by trans_inc_lock */
- /* which transaction were we created on? Zero if none. */
- unsigned long ip_created_trans;
- /* last transaction we were a part of. */
- unsigned long ip_last_trans;
+ /* Only valid if the inode is the dir. */
+ u32 ip_last_used_slot;
+ u64 ip_last_used_group;
+ u32 ip_dir_lock_gen;
- struct ocfs2_caching_info ip_metadata_cache;
+ struct ocfs2_alloc_reservation ip_la_data_resv;
- struct inode vfs_inode;
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ tid_t i_sync_tid;
+ tid_t i_datasync_tid;
};
/*
@@ -83,8 +91,6 @@ struct ocfs2_inode_info
#define OCFS2_INODE_BITMAP 0x00000004
/* This inode has been wiped from disk */
#define OCFS2_INODE_DELETED 0x00000008
-/* Another node is deleting, so our delete is a nop */
-#define OCFS2_INODE_SKIP_DELETE 0x00000010
/* Has the inode been orphaned on another node?
*
* This hints to ocfs2_drop_inode that it should clear i_nlink before
@@ -99,11 +105,11 @@ struct ocfs2_inode_info
* rely on ocfs2_delete_inode to sort things out under the proper
* cluster locks.
*/
-#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
+#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010
/* Does someone have the file open O_DIRECT */
-#define OCFS2_INODE_OPEN_DIRECT 0x00000040
-/* Indicates that the metadata cache should be used as an array. */
-#define OCFS2_INODE_CACHE_INLINE 0x00000080
+#define OCFS2_INODE_OPEN_DIRECT 0x00000020
+/* Tell the inode wipe code it's not in orphan dir */
+#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
{
@@ -113,29 +119,29 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
-extern kmem_cache_t *ocfs2_inode_cache;
+extern struct kmem_cache *ocfs2_inode_cache;
extern const struct address_space_operations ocfs2_aops;
+extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops;
-struct buffer_head *ocfs2_bread(struct inode *inode, int block,
- int *err, int reada);
-void ocfs2_clear_inode(struct inode *inode);
-void ocfs2_delete_inode(struct inode *inode);
-void ocfs2_drop_inode(struct inode *inode);
+static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
+{
+ return &OCFS2_I(inode)->ip_metadata_cache;
+}
+
+void ocfs2_evict_inode(struct inode *inode);
+int ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
-#define OCFS2_FI_FLAG_NOWAIT 0x1
-#define OCFS2_FI_FLAG_DELETE 0x2
-#define OCFS2_FI_FLAG_SYSFILE 0x4
-#define OCFS2_FI_FLAG_NOLOCK 0x8
-struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
-struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
- u64 blkno,
- int delete_vote);
+#define OCFS2_FI_FLAG_SYSFILE 0x1
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
+struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
+ int sysfile_type);
int ocfs2_inode_init_private(struct inode *inode);
int ocfs2_inode_revalidate(struct dentry *dentry);
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
- int create_ino);
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+ int create_ino);
void ocfs2_read_inode(struct inode *inode);
void ocfs2_read_inode2(struct inode *inode, void *opaque);
ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
@@ -143,12 +149,38 @@ ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
void ocfs2_sync_blockdev(struct super_block *sb);
void ocfs2_refresh_inode(struct inode *inode,
struct ocfs2_dinode *fe);
-int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
+int ocfs2_mark_inode_dirty(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
-int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
-int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+struct buffer_head *ocfs2_bread(struct inode *inode,
+ int block, int *err, int reada);
void ocfs2_set_inode_flags(struct inode *inode);
+void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
+
+static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
+{
+ int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
+
+ return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
+}
+
+/* Validate that a bh contains a valid inode */
+int ocfs2_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+/*
+ * Read an inode block into *bh. If *bh is NULL, a bh will be allocated.
+ * This is a cached read. The inode will be validated with
+ * ocfs2_validate_inode_block().
+ */
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
+/* The same, but can be passed OCFS2_BH_* flags */
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+ int flags);
+
+static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci)
+{
+ return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache);
+}
#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 3663cef8068..6f66b3751ac 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,34 +7,73 @@
#include <linux/fs.h>
#include <linux/mount.h>
+#include <linux/blkdev.h>
+#include <linux/compat.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dlmglue.h"
+#include "file.h"
#include "inode.h"
#include "journal.h"
#include "ocfs2_fs.h"
#include "ioctl.h"
+#include "resize.h"
+#include "refcounttree.h"
+#include "sysfile.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "move_extents.h"
-#include <linux/ext2_fs.h>
+#define o2info_from_user(a, b) \
+ copy_from_user(&(a), (b), sizeof(a))
+#define o2info_to_user(a, b) \
+ copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
+
+/*
+ * This call is void because we are already reporting an error that may
+ * be -EFAULT. The error will be returned from the ioctl(2) call. It's
+ * just a best-effort to tell userspace that this request caused the error.
+ */
+static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
+ struct ocfs2_info_request __user *req)
+{
+ kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
+ (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
+}
+
+static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
+{
+ req->ir_flags |= OCFS2_INFO_FL_FILLED;
+}
+
+static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
+{
+ req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
+}
+
+static inline int o2info_coherent(struct ocfs2_info_request *req)
+{
+ return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
+}
static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
{
int status;
- status = ocfs2_meta_lock(inode, NULL, NULL, 0);
+ status = ocfs2_inode_lock(inode, NULL, 0);
if (status < 0) {
mlog_errno(status);
return status;
}
+ ocfs2_get_inode_flags(OCFS2_I(inode));
*flags = OCFS2_I(inode)->ip_attr;
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
- mlog_exit(status);
return status;
}
@@ -43,37 +82,26 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
{
struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_journal_handle *handle = NULL;
+ handle_t *handle = NULL;
struct buffer_head *bh = NULL;
unsigned oldflags;
int status;
mutex_lock(&inode->i_mutex);
- status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+ status = ocfs2_inode_lock(inode, &bh, 1);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = -EROFS;
- if (IS_RDONLY(inode))
- goto bail_unlock;
-
status = -EACCES;
- if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+ if (!inode_owner_or_capable(inode))
goto bail_unlock;
if (!S_ISDIR(inode->i_mode))
flags &= ~OCFS2_DIRSYNC_FL;
- handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- mlog_errno(status);
- goto bail_unlock;
- }
-
oldflags = ocfs2_inode->ip_attr;
flags = flags & mask;
flags |= oldflags & ~mask;
@@ -89,6 +117,13 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
goto bail_unlock;
}
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto bail_unlock;
+ }
+
ocfs2_inode->ip_attr = flags;
ocfs2_set_inode_flags(inode);
@@ -96,24 +131,773 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
if (status < 0)
mlog_errno(status);
- ocfs2_commit_trans(handle);
+ ocfs2_commit_trans(osb, handle);
+
bail_unlock:
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
bail:
mutex_unlock(&inode->i_mutex);
- if (bh)
- brelse(bh);
+ brelse(bh);
+
+ return status;
+}
+
+static int ocfs2_info_handle_blocksize(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_blocksize oib;
+
+ if (o2info_from_user(oib, req))
+ goto bail;
+
+ oib.ib_blocksize = inode->i_sb->s_blocksize;
+
+ o2info_set_request_filled(&oib.ib_req);
+
+ if (o2info_to_user(oib, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oib.ib_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_handle_clustersize(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_clustersize oic;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oic, req))
+ goto bail;
+
+ oic.ic_clustersize = osb->s_clustersize;
+
+ o2info_set_request_filled(&oic.ic_req);
+
+ if (o2info_to_user(oic, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oic.ic_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_handle_maxslots(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_maxslots oim;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oim, req))
+ goto bail;
+
+ oim.im_max_slots = osb->max_slots;
+
+ o2info_set_request_filled(&oim.im_req);
+
+ if (o2info_to_user(oim, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oim.im_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_handle_label(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_label oil;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oil, req))
+ goto bail;
+
+ memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
+
+ o2info_set_request_filled(&oil.il_req);
+
+ if (o2info_to_user(oil, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oil.il_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_handle_uuid(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_uuid oiu;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oiu, req))
+ goto bail;
+
+ memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
+
+ o2info_set_request_filled(&oiu.iu_req);
+
+ if (o2info_to_user(oiu, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oiu.iu_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_handle_fs_features(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_fs_features oif;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oif, req))
+ goto bail;
+
+ oif.if_compat_features = osb->s_feature_compat;
+ oif.if_incompat_features = osb->s_feature_incompat;
+ oif.if_ro_compat_features = osb->s_feature_ro_compat;
+
+ o2info_set_request_filled(&oif.if_req);
+
+ if (o2info_to_user(oif, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oif.if_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_handle_journal_size(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_journal_size oij;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oij, req))
+ goto bail;
+
+ oij.ij_journal_size = i_size_read(osb->journal->j_inode);
+
+ o2info_set_request_filled(&oij.ij_req);
+
+ if (o2info_to_user(oij, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oij.ij_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
+ struct inode *inode_alloc, u64 blkno,
+ struct ocfs2_info_freeinode *fi,
+ u32 slot)
+{
+ int status = 0, unlock = 0;
+
+ struct buffer_head *bh = NULL;
+ struct ocfs2_dinode *dinode_alloc = NULL;
+
+ if (inode_alloc)
+ mutex_lock(&inode_alloc->i_mutex);
+
+ if (o2info_coherent(&fi->ifi_req)) {
+ status = ocfs2_inode_lock(inode_alloc, &bh, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ unlock = 1;
+ } else {
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
+
+ fi->ifi_stat[slot].lfi_total =
+ le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
+ fi->ifi_stat[slot].lfi_free =
+ le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
+ le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
+
+bail:
+ if (unlock)
+ ocfs2_inode_unlock(inode_alloc, 0);
+
+ if (inode_alloc)
+ mutex_unlock(&inode_alloc->i_mutex);
+
+ brelse(bh);
+
+ return status;
+}
+
+static int ocfs2_info_handle_freeinode(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ u32 i;
+ u64 blkno = -1;
+ char namebuf[40];
+ int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
+ struct ocfs2_info_freeinode *oifi = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *inode_alloc = NULL;
+
+ oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
+ if (!oifi) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_err;
+ }
+
+ if (o2info_from_user(*oifi, req))
+ goto bail;
+
+ oifi->ifi_slotnum = osb->max_slots;
+
+ for (i = 0; i < oifi->ifi_slotnum; i++) {
+ if (o2info_coherent(&oifi->ifi_req)) {
+ inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
+ if (!inode_alloc) {
+ mlog(ML_ERROR, "unable to get alloc inode in "
+ "slot %u\n", i);
+ status = -EIO;
+ goto bail;
+ }
+ } else {
+ ocfs2_sprintf_system_inode_name(namebuf,
+ sizeof(namebuf),
+ type, i);
+ status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+ namebuf,
+ strlen(namebuf),
+ &blkno);
+ if (status < 0) {
+ status = -ENOENT;
+ goto bail;
+ }
+ }
+
+ status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
+
+ iput(inode_alloc);
+ inode_alloc = NULL;
+
+ if (status < 0)
+ goto bail;
+ }
+
+ o2info_set_request_filled(&oifi->ifi_req);
+
+ if (o2info_to_user(*oifi, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oifi->ifi_req, req);
+
+ kfree(oifi);
+out_err:
+ return status;
+}
+
+static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
+ unsigned int chunksize)
+{
+ int index;
+
+ index = __ilog2_u32(chunksize);
+ if (index >= OCFS2_INFO_MAX_HIST)
+ index = OCFS2_INFO_MAX_HIST - 1;
+
+ hist->fc_chunks[index]++;
+ hist->fc_clusters[index] += chunksize;
+}
+
+static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
+ unsigned int chunksize)
+{
+ if (chunksize > stats->ffs_max)
+ stats->ffs_max = chunksize;
+
+ if (chunksize < stats->ffs_min)
+ stats->ffs_min = chunksize;
+
+ stats->ffs_avg += chunksize;
+ stats->ffs_free_chunks_real++;
+}
+
+static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
+ unsigned int chunksize)
+{
+ o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
+ o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
+}
+
+static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
+ struct inode *gb_inode,
+ struct ocfs2_dinode *gb_dinode,
+ struct ocfs2_chain_rec *rec,
+ struct ocfs2_info_freefrag *ffg,
+ u32 chunks_in_group)
+{
+ int status = 0, used;
+ u64 blkno;
+
+ struct buffer_head *bh = NULL;
+ struct ocfs2_group_desc *bg = NULL;
+
+ unsigned int max_bits, num_clusters;
+ unsigned int offset = 0, cluster, chunk;
+ unsigned int chunk_free, last_chunksize = 0;
+
+ if (!le32_to_cpu(rec->c_free))
+ goto bail;
+
+ do {
+ if (!bg)
+ blkno = le64_to_cpu(rec->c_blkno);
+ else
+ blkno = le64_to_cpu(bg->bg_next_group);
+
+ if (bh) {
+ brelse(bh);
+ bh = NULL;
+ }
+
+ if (o2info_coherent(&ffg->iff_req))
+ status = ocfs2_read_group_descriptor(gb_inode,
+ gb_dinode,
+ blkno, &bh);
+ else
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+
+ if (status < 0) {
+ mlog(ML_ERROR, "Can't read the group descriptor # "
+ "%llu from device.", (unsigned long long)blkno);
+ status = -EIO;
+ goto bail;
+ }
+
+ bg = (struct ocfs2_group_desc *)bh->b_data;
+
+ if (!le16_to_cpu(bg->bg_free_bits_count))
+ continue;
+
+ max_bits = le16_to_cpu(bg->bg_bits);
+ offset = 0;
+
+ for (chunk = 0; chunk < chunks_in_group; chunk++) {
+ /*
+ * last chunk may be not an entire one.
+ */
+ if ((offset + ffg->iff_chunksize) > max_bits)
+ num_clusters = max_bits - offset;
+ else
+ num_clusters = ffg->iff_chunksize;
+
+ chunk_free = 0;
+ for (cluster = 0; cluster < num_clusters; cluster++) {
+ used = ocfs2_test_bit(offset,
+ (unsigned long *)bg->bg_bitmap);
+ /*
+ * - chunk_free counts free clusters in #N chunk.
+ * - last_chunksize records the size(in) clusters
+ * for the last real free chunk being counted.
+ */
+ if (!used) {
+ last_chunksize++;
+ chunk_free++;
+ }
+
+ if (used && last_chunksize) {
+ ocfs2_info_update_ffg(ffg,
+ last_chunksize);
+ last_chunksize = 0;
+ }
+
+ offset++;
+ }
+
+ if (chunk_free == ffg->iff_chunksize)
+ ffg->iff_ffs.ffs_free_chunks++;
+ }
+
+ /*
+ * need to update the info for last free chunk.
+ */
+ if (last_chunksize)
+ ocfs2_info_update_ffg(ffg, last_chunksize);
+
+ } while (le64_to_cpu(bg->bg_next_group));
+
+bail:
+ brelse(bh);
+
+ return status;
+}
+
+static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
+ struct inode *gb_inode, u64 blkno,
+ struct ocfs2_info_freefrag *ffg)
+{
+ u32 chunks_in_group;
+ int status = 0, unlock = 0, i;
+
+ struct buffer_head *bh = NULL;
+ struct ocfs2_chain_list *cl = NULL;
+ struct ocfs2_chain_rec *rec = NULL;
+ struct ocfs2_dinode *gb_dinode = NULL;
+
+ if (gb_inode)
+ mutex_lock(&gb_inode->i_mutex);
+
+ if (o2info_coherent(&ffg->iff_req)) {
+ status = ocfs2_inode_lock(gb_inode, &bh, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ unlock = 1;
+ } else {
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ gb_dinode = (struct ocfs2_dinode *)bh->b_data;
+ cl = &(gb_dinode->id2.i_chain);
+
+ /*
+ * Chunksize(in) clusters from userspace should be
+ * less than clusters in a group.
+ */
+ if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
+
+ ffg->iff_ffs.ffs_min = ~0U;
+ ffg->iff_ffs.ffs_clusters =
+ le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
+ ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
+ le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
+
+ chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
+
+ for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+ rec = &(cl->cl_recs[i]);
+ status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
+ gb_dinode,
+ rec, ffg,
+ chunks_in_group);
+ if (status)
+ goto bail;
+ }
+
+ if (ffg->iff_ffs.ffs_free_chunks_real)
+ ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
+ ffg->iff_ffs.ffs_free_chunks_real);
+bail:
+ if (unlock)
+ ocfs2_inode_unlock(gb_inode, 0);
+
+ if (gb_inode)
+ mutex_unlock(&gb_inode->i_mutex);
+
+ if (gb_inode)
+ iput(gb_inode);
+
+ brelse(bh);
+
+ return status;
+}
+
+static int ocfs2_info_handle_freefrag(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ u64 blkno = -1;
+ char namebuf[40];
+ int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
+
+ struct ocfs2_info_freefrag *oiff;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *gb_inode = NULL;
+
+ oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
+ if (!oiff) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_err;
+ }
+
+ if (o2info_from_user(*oiff, req))
+ goto bail;
+ /*
+ * chunksize from userspace should be power of 2.
+ */
+ if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
+ (!oiff->iff_chunksize)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ if (o2info_coherent(&oiff->iff_req)) {
+ gb_inode = ocfs2_get_system_file_inode(osb, type,
+ OCFS2_INVALID_SLOT);
+ if (!gb_inode) {
+ mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+ status = -EIO;
+ goto bail;
+ }
+ } else {
+ ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
+ OCFS2_INVALID_SLOT);
+ status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+ namebuf,
+ strlen(namebuf),
+ &blkno);
+ if (status < 0) {
+ status = -ENOENT;
+ goto bail;
+ }
+ }
+
+ status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
+ if (status < 0)
+ goto bail;
+
+ o2info_set_request_filled(&oiff->iff_req);
- mlog_exit(status);
+ if (o2info_to_user(*oiff, req)) {
+ status = -EFAULT;
+ goto bail;
+ }
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oiff->iff_req, req);
+
+ kfree(oiff);
+out_err:
return status;
}
-int ocfs2_ioctl(struct inode * inode, struct file * filp,
- unsigned int cmd, unsigned long arg)
+static int ocfs2_info_handle_unknown(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
+ int status = -EFAULT;
+ struct ocfs2_info_request oir;
+
+ if (o2info_from_user(oir, req))
+ goto bail;
+
+ o2info_clear_request_filled(&oir);
+
+ if (o2info_to_user(oir, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oir, req);
+
+ return status;
+}
+
+/*
+ * Validate and distinguish OCFS2_IOC_INFO requests.
+ *
+ * - validate the magic number.
+ * - distinguish different requests.
+ * - validate size of different requests.
+ */
+static int ocfs2_info_handle_request(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_request oir;
+
+ if (o2info_from_user(oir, req))
+ goto bail;
+
+ status = -EINVAL;
+ if (oir.ir_magic != OCFS2_INFO_MAGIC)
+ goto bail;
+
+ switch (oir.ir_code) {
+ case OCFS2_INFO_BLOCKSIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
+ status = ocfs2_info_handle_blocksize(inode, req);
+ break;
+ case OCFS2_INFO_CLUSTERSIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
+ status = ocfs2_info_handle_clustersize(inode, req);
+ break;
+ case OCFS2_INFO_MAXSLOTS:
+ if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
+ status = ocfs2_info_handle_maxslots(inode, req);
+ break;
+ case OCFS2_INFO_LABEL:
+ if (oir.ir_size == sizeof(struct ocfs2_info_label))
+ status = ocfs2_info_handle_label(inode, req);
+ break;
+ case OCFS2_INFO_UUID:
+ if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
+ status = ocfs2_info_handle_uuid(inode, req);
+ break;
+ case OCFS2_INFO_FS_FEATURES:
+ if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
+ status = ocfs2_info_handle_fs_features(inode, req);
+ break;
+ case OCFS2_INFO_JOURNAL_SIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
+ status = ocfs2_info_handle_journal_size(inode, req);
+ break;
+ case OCFS2_INFO_FREEINODE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
+ status = ocfs2_info_handle_freeinode(inode, req);
+ break;
+ case OCFS2_INFO_FREEFRAG:
+ if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
+ status = ocfs2_info_handle_freefrag(inode, req);
+ break;
+ default:
+ status = ocfs2_info_handle_unknown(inode, req);
+ break;
+ }
+
+bail:
+ return status;
+}
+
+static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
+ u64 *req_addr, int compat_flag)
+{
+ int status = -EFAULT;
+ u64 __user *bp = NULL;
+
+ if (compat_flag) {
+#ifdef CONFIG_COMPAT
+ /*
+ * pointer bp stores the base address of a pointers array,
+ * which collects all addresses of separate request.
+ */
+ bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
+#else
+ BUG();
+#endif
+ } else
+ bp = (u64 __user *)(unsigned long)(info->oi_requests);
+
+ if (o2info_from_user(*req_addr, bp + idx))
+ goto bail;
+
+ status = 0;
+bail:
+ return status;
+}
+
+/*
+ * OCFS2_IOC_INFO handles an array of requests passed from userspace.
+ *
+ * ocfs2_info_handle() recevies a large info aggregation, grab and
+ * validate the request count from header, then break it into small
+ * pieces, later specific handlers can handle them one by one.
+ *
+ * Idea here is to make each separate request small enough to ensure
+ * a better backward&forward compatibility, since a small piece of
+ * request will be less likely to be broken if disk layout get changed.
+ */
+static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
+ int compat_flag)
+{
+ int i, status = 0;
+ u64 req_addr;
+ struct ocfs2_info_request __user *reqp;
+
+ if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
+ (!info->oi_requests)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ for (i = 0; i < info->oi_count; i++) {
+
+ status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
+ if (status)
+ break;
+
+ reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr;
+ if (!reqp) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ status = ocfs2_info_handle_request(inode, reqp);
+ if (status)
+ break;
+ }
+
+bail:
+ return status;
+}
+
+long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
unsigned int flags;
+ int new_clusters;
int status;
+ struct ocfs2_space_resv sr;
+ struct ocfs2_new_group_input input;
+ struct reflink_arguments args;
+ const char __user *old_path;
+ const char __user *new_path;
+ bool preserve;
+ struct ocfs2_info info;
+ void __user *argp = (void __user *)arg;
switch (cmd) {
case OCFS2_IOC_GETFLAGS:
@@ -127,10 +911,138 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- return ocfs2_set_inode_attr(inode, flags,
+ status = mnt_want_write_file(filp);
+ if (status)
+ return status;
+ status = ocfs2_set_inode_attr(inode, flags,
OCFS2_FL_MODIFIABLE);
+ mnt_drop_write_file(filp);
+ return status;
+ case OCFS2_IOC_RESVSP:
+ case OCFS2_IOC_RESVSP64:
+ case OCFS2_IOC_UNRESVSP:
+ case OCFS2_IOC_UNRESVSP64:
+ if (copy_from_user(&sr, (int __user *) arg, sizeof(sr)))
+ return -EFAULT;
+
+ return ocfs2_change_file_space(filp, cmd, &sr);
+ case OCFS2_IOC_GROUP_EXTEND:
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (get_user(new_clusters, (int __user *)arg))
+ return -EFAULT;
+
+ status = mnt_want_write_file(filp);
+ if (status)
+ return status;
+ status = ocfs2_group_extend(inode, new_clusters);
+ mnt_drop_write_file(filp);
+ return status;
+ case OCFS2_IOC_GROUP_ADD:
+ case OCFS2_IOC_GROUP_ADD64:
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
+ return -EFAULT;
+
+ status = mnt_want_write_file(filp);
+ if (status)
+ return status;
+ status = ocfs2_group_add(inode, &input);
+ mnt_drop_write_file(filp);
+ return status;
+ case OCFS2_IOC_REFLINK:
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ old_path = (const char __user *)(unsigned long)args.old_path;
+ new_path = (const char __user *)(unsigned long)args.new_path;
+ preserve = (args.preserve != 0);
+
+ return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+ case OCFS2_IOC_INFO:
+ if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
+ return -EFAULT;
+
+ return ocfs2_info_handle(inode, &info, 0);
+ case FITRIM:
+ {
+ struct super_block *sb = inode->i_sb;
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ struct fstrim_range range;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&range, argp, sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max_t(u64, q->limits.discard_granularity,
+ range.minlen);
+ ret = ocfs2_trim_fs(sb, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(argp, &range, sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+ }
+ case OCFS2_IOC_MOVE_EXT:
+ return ocfs2_ioctl_move_extents(filp, argp);
default:
return -ENOTTY;
}
}
+#ifdef CONFIG_COMPAT
+long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+ bool preserve;
+ struct reflink_arguments args;
+ struct inode *inode = file_inode(file);
+ struct ocfs2_info info;
+ void __user *argp = (void __user *)arg;
+
+ switch (cmd) {
+ case OCFS2_IOC32_GETFLAGS:
+ cmd = OCFS2_IOC_GETFLAGS;
+ break;
+ case OCFS2_IOC32_SETFLAGS:
+ cmd = OCFS2_IOC_SETFLAGS;
+ break;
+ case OCFS2_IOC_RESVSP:
+ case OCFS2_IOC_RESVSP64:
+ case OCFS2_IOC_UNRESVSP:
+ case OCFS2_IOC_UNRESVSP64:
+ case OCFS2_IOC_GROUP_EXTEND:
+ case OCFS2_IOC_GROUP_ADD:
+ case OCFS2_IOC_GROUP_ADD64:
+ case FITRIM:
+ break;
+ case OCFS2_IOC_REFLINK:
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ preserve = (args.preserve != 0);
+
+ return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
+ compat_ptr(args.new_path), preserve);
+ case OCFS2_IOC_INFO:
+ if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
+ return -EFAULT;
+
+ return ocfs2_info_handle(inode, &info, 1);
+ case OCFS2_IOC_MOVE_EXT:
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return ocfs2_ioctl(file, cmd, arg);
+}
+#endif
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 4a7c82931db..0cd5323bd3f 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -7,10 +7,10 @@
*
*/
-#ifndef OCFS2_IOCTL_H
-#define OCFS2_IOCTL_H
+#ifndef OCFS2_IOCTL_PROTO_H
+#define OCFS2_IOCTL_PROTO_H
-int ocfs2_ioctl(struct inode * inode, struct file * filp,
- unsigned int cmd, unsigned long arg);
+long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
-#endif /* OCFS2_IOCTL_H */
+#endif /* OCFS2_IOCTL_PROTO_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index fd9734def55..4b0c68849b3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -28,295 +28,386 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/random.h>
+#include <linux/delay.h>
-#define MLOG_MASK_PREFIX ML_JOURNAL
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
+#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
-#include "namei.h"
#include "slot_map.h"
#include "super.h"
-#include "vote.h"
#include "sysfile.h"
+#include "uptodate.h"
+#include "quota.h"
#include "buffer_head_io.h"
+#include "ocfs2_trace.h"
DEFINE_SPINLOCK(trans_inc_lock);
+#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
+
static int ocfs2_force_read_journal(struct inode *inode);
static int ocfs2_recover_node(struct ocfs2_super *osb,
- int node_num);
+ int node_num, int slot_num);
static int __ocfs2_recovery_thread(void *arg);
static int ocfs2_commit_cache(struct ocfs2_super *osb);
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
-static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
- struct ocfs2_journal_handle *handle);
-static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle);
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
- int dirty);
+ int dirty, int replayed);
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
int slot_num);
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
int slot);
static int ocfs2_commit_thread(void *arg);
+static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
+ int slot_num,
+ struct ocfs2_dinode *la_dinode,
+ struct ocfs2_dinode *tl_dinode,
+ struct ocfs2_quota_recovery *qrec);
-static int ocfs2_commit_cache(struct ocfs2_super *osb)
+static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
{
- int status = 0;
- unsigned int flushed;
- unsigned long old_id;
- struct ocfs2_journal *journal = NULL;
+ return __ocfs2_wait_on_mount(osb, 0);
+}
- mlog_entry_void();
+static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
+{
+ return __ocfs2_wait_on_mount(osb, 1);
+}
- journal = osb->journal;
+/*
+ * This replay_map is to track online/offline slots, so we could recover
+ * offline slots during recovery and mount
+ */
- /* Flush all pending commits and checkpoint the journal. */
- down_write(&journal->j_trans_barrier);
+enum ocfs2_replay_state {
+ REPLAY_UNNEEDED = 0, /* Replay is not needed, so ignore this map */
+ REPLAY_NEEDED, /* Replay slots marked in rm_replay_slots */
+ REPLAY_DONE /* Replay was already queued */
+};
- if (atomic_read(&journal->j_num_trans) == 0) {
- up_write(&journal->j_trans_barrier);
- mlog(0, "No transactions for me to flush!\n");
- goto finally;
- }
+struct ocfs2_replay_map {
+ unsigned int rm_slots;
+ enum ocfs2_replay_state rm_state;
+ unsigned char rm_replay_slots[0];
+};
- journal_lock_updates(journal->j_journal);
- status = journal_flush(journal->j_journal);
- journal_unlock_updates(journal->j_journal);
- if (status < 0) {
- up_write(&journal->j_trans_barrier);
- mlog_errno(status);
- goto finally;
+void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
+{
+ if (!osb->replay_map)
+ return;
+
+ /* If we've already queued the replay, we don't have any more to do */
+ if (osb->replay_map->rm_state == REPLAY_DONE)
+ return;
+
+ osb->replay_map->rm_state = state;
+}
+
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
+{
+ struct ocfs2_replay_map *replay_map;
+ int i, node_num;
+
+ /* If replay map is already set, we don't do it again */
+ if (osb->replay_map)
+ return 0;
+
+ replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
+ (osb->max_slots * sizeof(char)), GFP_KERNEL);
+
+ if (!replay_map) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
- old_id = ocfs2_inc_trans_id(journal);
+ spin_lock(&osb->osb_lock);
- flushed = atomic_read(&journal->j_num_trans);
- atomic_set(&journal->j_num_trans, 0);
- up_write(&journal->j_trans_barrier);
+ replay_map->rm_slots = osb->max_slots;
+ replay_map->rm_state = REPLAY_UNNEEDED;
- mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
- journal->j_trans_id, flushed);
+ /* set rm_replay_slots for offline slot(s) */
+ for (i = 0; i < replay_map->rm_slots; i++) {
+ if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT)
+ replay_map->rm_replay_slots[i] = 1;
+ }
- ocfs2_kick_vote_thread(osb);
- wake_up(&journal->j_checkpointed);
-finally:
- mlog_exit(status);
- return status;
+ osb->replay_map = replay_map;
+ spin_unlock(&osb->osb_lock);
+ return 0;
}
-struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb)
+void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
{
- struct ocfs2_journal_handle *retval = NULL;
-
- retval = kcalloc(1, sizeof(*retval), GFP_NOFS);
- if (!retval) {
- mlog(ML_ERROR, "Failed to allocate memory for journal "
- "handle!\n");
- return NULL;
- }
+ struct ocfs2_replay_map *replay_map = osb->replay_map;
+ int i;
- retval->max_buffs = 0;
- retval->num_locks = 0;
- retval->k_handle = NULL;
+ if (!replay_map)
+ return;
- INIT_LIST_HEAD(&retval->locks);
- INIT_LIST_HEAD(&retval->inode_list);
- retval->journal = osb->journal;
+ if (replay_map->rm_state != REPLAY_NEEDED)
+ return;
- return retval;
+ for (i = 0; i < replay_map->rm_slots; i++)
+ if (replay_map->rm_replay_slots[i])
+ ocfs2_queue_recovery_completion(osb->journal, i, NULL,
+ NULL, NULL);
+ replay_map->rm_state = REPLAY_DONE;
}
-/* pass it NULL and it will allocate a new handle object for you. If
- * you pass it a handle however, it may still return error, in which
- * case it has free'd the passed handle for you. */
-struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- int max_buffs)
+void ocfs2_free_replay_slots(struct ocfs2_super *osb)
{
- int ret;
- journal_t *journal = osb->journal->j_journal;
+ struct ocfs2_replay_map *replay_map = osb->replay_map;
- mlog_entry("(max_buffs = %d)\n", max_buffs);
+ if (!osb->replay_map)
+ return;
- BUG_ON(!osb || !osb->journal->j_journal);
+ kfree(replay_map);
+ osb->replay_map = NULL;
+}
- if (ocfs2_is_hard_readonly(osb)) {
- ret = -EROFS;
- goto done_free;
- }
+int ocfs2_recovery_init(struct ocfs2_super *osb)
+{
+ struct ocfs2_recovery_map *rm;
- BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
- BUG_ON(max_buffs <= 0);
+ mutex_init(&osb->recovery_lock);
+ osb->disable_recovery = 0;
+ osb->recovery_thread_task = NULL;
+ init_waitqueue_head(&osb->recovery_event);
- /* JBD might support this, but our journalling code doesn't yet. */
- if (journal_current_handle()) {
- mlog(ML_ERROR, "Recursive transaction attempted!\n");
- BUG();
+ rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
+ osb->max_slots * sizeof(unsigned int),
+ GFP_KERNEL);
+ if (!rm) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
- if (!handle)
- handle = ocfs2_alloc_handle(osb);
- if (!handle) {
- ret = -ENOMEM;
- mlog(ML_ERROR, "Failed to allocate memory for journal "
- "handle!\n");
- goto done_free;
- }
+ rm->rm_entries = (unsigned int *)((char *)rm +
+ sizeof(struct ocfs2_recovery_map));
+ osb->recovery_map = rm;
- handle->max_buffs = max_buffs;
+ return 0;
+}
- down_read(&osb->journal->j_trans_barrier);
+/* we can't grab the goofy sem lock from inside wait_event, so we use
+ * memory barriers to make sure that we'll see the null task before
+ * being woken up */
+static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
+{
+ mb();
+ return osb->recovery_thread_task != NULL;
+}
- /* actually start the transaction now */
- handle->k_handle = journal_start(journal, max_buffs);
- if (IS_ERR(handle->k_handle)) {
- up_read(&osb->journal->j_trans_barrier);
+void ocfs2_recovery_exit(struct ocfs2_super *osb)
+{
+ struct ocfs2_recovery_map *rm;
- ret = PTR_ERR(handle->k_handle);
- handle->k_handle = NULL;
- mlog_errno(ret);
+ /* disable any new recovery threads and wait for any currently
+ * running ones to exit. Do this before setting the vol_state. */
+ mutex_lock(&osb->recovery_lock);
+ osb->disable_recovery = 1;
+ mutex_unlock(&osb->recovery_lock);
+ wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
- if (is_journal_aborted(journal)) {
- ocfs2_abort(osb->sb, "Detected aborted journal");
- ret = -EROFS;
- }
- goto done_free;
- }
+ /* At this point, we know that no more recovery threads can be
+ * launched, so wait for any recovery completion work to
+ * complete. */
+ flush_workqueue(ocfs2_wq);
- atomic_inc(&(osb->journal->j_num_trans));
- handle->flags |= OCFS2_HANDLE_STARTED;
+ /*
+ * Now that recovery is shut down, and the osb is about to be
+ * freed, the osb_lock is not taken here.
+ */
+ rm = osb->recovery_map;
+ /* XXX: Should we bug if there are dirty entries? */
- mlog_exit_ptr(handle);
- return handle;
+ kfree(rm);
+}
+
+static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
+ unsigned int node_num)
+{
+ int i;
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+ assert_spin_locked(&osb->osb_lock);
-done_free:
- if (handle)
- ocfs2_commit_unstarted_handle(handle); /* will kfree handle */
+ for (i = 0; i < rm->rm_used; i++) {
+ if (rm->rm_entries[i] == node_num)
+ return 1;
+ }
- mlog_exit(ret);
- return ERR_PTR(ret);
+ return 0;
}
-void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
- struct inode *inode)
+/* Behaves like test-and-set. Returns the previous value */
+static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+ unsigned int node_num)
{
- BUG_ON(!handle);
- BUG_ON(!inode);
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
- atomic_inc(&inode->i_count);
+ spin_lock(&osb->osb_lock);
+ if (__ocfs2_recovery_map_test(osb, node_num)) {
+ spin_unlock(&osb->osb_lock);
+ return 1;
+ }
- /* we're obviously changing it... */
- mutex_lock(&inode->i_mutex);
+ /* XXX: Can this be exploited? Not from o2dlm... */
+ BUG_ON(rm->rm_used >= osb->max_slots);
- /* sanity check */
- BUG_ON(OCFS2_I(inode)->ip_handle);
- BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
+ rm->rm_entries[rm->rm_used] = node_num;
+ rm->rm_used++;
+ spin_unlock(&osb->osb_lock);
- OCFS2_I(inode)->ip_handle = handle;
- list_move_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
+ return 0;
}
-static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
+static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+ unsigned int node_num)
{
- struct list_head *p, *n;
- struct inode *inode;
- struct ocfs2_inode_info *oi;
+ int i;
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
- list_for_each_safe(p, n, &handle->inode_list) {
- oi = list_entry(p, struct ocfs2_inode_info,
- ip_handle_list);
- inode = &oi->vfs_inode;
+ spin_lock(&osb->osb_lock);
- OCFS2_I(inode)->ip_handle = NULL;
- list_del_init(&OCFS2_I(inode)->ip_handle_list);
+ for (i = 0; i < rm->rm_used; i++) {
+ if (rm->rm_entries[i] == node_num)
+ break;
+ }
- mutex_unlock(&inode->i_mutex);
- iput(inode);
+ if (i < rm->rm_used) {
+ /* XXX: be careful with the pointer math */
+ memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]),
+ (rm->rm_used - i - 1) * sizeof(unsigned int));
+ rm->rm_used--;
}
+
+ spin_unlock(&osb->osb_lock);
}
-/* This is trivial so we do it out of the main commit
- * paths. Beware, it can be called from start_trans too! */
-static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle)
+static int ocfs2_commit_cache(struct ocfs2_super *osb)
{
- mlog_entry_void();
+ int status = 0;
+ unsigned int flushed;
+ struct ocfs2_journal *journal = NULL;
+
+ journal = osb->journal;
+
+ /* Flush all pending commits and checkpoint the journal. */
+ down_write(&journal->j_trans_barrier);
+
+ flushed = atomic_read(&journal->j_num_trans);
+ trace_ocfs2_commit_cache_begin(flushed);
+ if (flushed == 0) {
+ up_write(&journal->j_trans_barrier);
+ goto finally;
+ }
+
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal);
+ jbd2_journal_unlock_updates(journal->j_journal);
+ if (status < 0) {
+ up_write(&journal->j_trans_barrier);
+ mlog_errno(status);
+ goto finally;
+ }
- BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
+ ocfs2_inc_trans_id(journal);
- ocfs2_handle_unlock_inodes(handle);
- /* You are allowed to add journal locks before the transaction
- * has started. */
- ocfs2_handle_cleanup_locks(handle->journal, handle);
+ flushed = atomic_read(&journal->j_num_trans);
+ atomic_set(&journal->j_num_trans, 0);
+ up_write(&journal->j_trans_barrier);
- kfree(handle);
+ trace_ocfs2_commit_cache_end(journal->j_trans_id, flushed);
- mlog_exit_void();
+ ocfs2_wake_downconvert_thread(osb);
+ wake_up(&journal->j_checkpointed);
+finally:
+ return status;
}
-void ocfs2_commit_trans(struct ocfs2_journal_handle *handle)
+handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
{
- handle_t *jbd_handle;
- int retval;
- struct ocfs2_journal *journal = handle->journal;
+ journal_t *journal = osb->journal->j_journal;
+ handle_t *handle;
- mlog_entry_void();
+ BUG_ON(!osb || !osb->journal->j_journal);
- BUG_ON(!handle);
+ if (ocfs2_is_hard_readonly(osb))
+ return ERR_PTR(-EROFS);
- if (!(handle->flags & OCFS2_HANDLE_STARTED)) {
- ocfs2_commit_unstarted_handle(handle);
- mlog_exit_void();
- return;
- }
+ BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
+ BUG_ON(max_buffs <= 0);
- /* release inode semaphores we took during this transaction */
- ocfs2_handle_unlock_inodes(handle);
+ /* Nested transaction? Just return the handle... */
+ if (journal_current_handle())
+ return jbd2_journal_start(journal, max_buffs);
- /* ocfs2_extend_trans may have had to call journal_restart
- * which will always commit the transaction, but may return
- * error for any number of reasons. If this is the case, we
- * clear k_handle as it's not valid any more. */
- if (handle->k_handle) {
- jbd_handle = handle->k_handle;
+ sb_start_intwrite(osb->sb);
- if (handle->flags & OCFS2_HANDLE_SYNC)
- jbd_handle->h_sync = 1;
- else
- jbd_handle->h_sync = 0;
-
- /* actually stop the transaction. if we've set h_sync,
- * it'll have been committed when we return */
- retval = journal_stop(jbd_handle);
- if (retval < 0) {
- mlog_errno(retval);
- mlog(ML_ERROR, "Could not commit transaction\n");
- BUG();
- }
+ down_read(&osb->journal->j_trans_barrier);
- handle->k_handle = NULL; /* it's been free'd in journal_stop */
+ handle = jbd2_journal_start(journal, max_buffs);
+ if (IS_ERR(handle)) {
+ up_read(&osb->journal->j_trans_barrier);
+ sb_end_intwrite(osb->sb);
+
+ mlog_errno(PTR_ERR(handle));
+
+ if (is_journal_aborted(journal)) {
+ ocfs2_abort(osb->sb, "Detected aborted journal");
+ handle = ERR_PTR(-EROFS);
+ }
+ } else {
+ if (!ocfs2_mount_local(osb))
+ atomic_inc(&(osb->journal->j_num_trans));
}
- ocfs2_handle_cleanup_locks(journal, handle);
+ return handle;
+}
+
+int ocfs2_commit_trans(struct ocfs2_super *osb,
+ handle_t *handle)
+{
+ int ret, nested;
+ struct ocfs2_journal *journal = osb->journal;
+
+ BUG_ON(!handle);
+
+ nested = handle->h_ref > 1;
+ ret = jbd2_journal_stop(handle);
+ if (ret < 0)
+ mlog_errno(ret);
- up_read(&journal->j_trans_barrier);
+ if (!nested) {
+ up_read(&journal->j_trans_barrier);
+ sb_end_intwrite(osb->sb);
+ }
- kfree(handle);
- mlog_exit_void();
+ return ret;
}
/*
- * 'nblocks' is what you want to add to the current
- * transaction. extend_trans will either extend the current handle by
- * nblocks, or commit it and start a new one with nblocks credits.
+ * 'nblocks' is what you want to add to the current transaction.
+ *
+ * This might call jbd2_journal_restart() which will commit dirty buffers
+ * and then restart the transaction. Before calling
+ * ocfs2_extend_trans(), any changed blocks should have been
+ * dirtied. After calling it, all blocks which need to be changed must
+ * go through another set of journal_access/journal_dirty calls.
*
* WARNING: This will not release any semaphores or disk locks taken
* during the transaction, so make sure they were taken *before*
@@ -326,62 +417,247 @@ void ocfs2_commit_trans(struct ocfs2_journal_handle *handle)
* good because transaction ids haven't yet been recorded on the
* cluster locks associated with this handle.
*/
-int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
- int nblocks)
+int ocfs2_extend_trans(handle_t *handle, int nblocks)
{
- int status;
+ int status, old_nblocks;
BUG_ON(!handle);
- BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
- BUG_ON(!nblocks);
+ BUG_ON(nblocks < 0);
+
+ if (!nblocks)
+ return 0;
- mlog_entry_void();
+ old_nblocks = handle->h_buffer_credits;
- mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
+ trace_ocfs2_extend_trans(old_nblocks, nblocks);
- status = journal_extend(handle->k_handle, nblocks);
+#ifdef CONFIG_OCFS2_DEBUG_FS
+ status = 1;
+#else
+ status = jbd2_journal_extend(handle, nblocks);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+#endif
if (status > 0) {
- mlog(0, "journal_extend failed, trying journal_restart\n");
- status = journal_restart(handle->k_handle, nblocks);
+ trace_ocfs2_extend_trans_restart(old_nblocks + nblocks);
+ status = jbd2_journal_restart(handle,
+ old_nblocks + nblocks);
if (status < 0) {
- handle->k_handle = NULL;
mlog_errno(status);
goto bail;
}
- handle->max_buffs = nblocks;
- } else
- handle->max_buffs += nblocks;
+ }
status = 0;
bail:
+ return status;
+}
- mlog_exit(status);
+/*
+ * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
+ * If that fails, restart the transaction & regain write access for the
+ * buffer head which is used for metadata modifications.
+ * Taken from Ext4: extend_or_restart_transaction()
+ */
+int ocfs2_allocate_extend_trans(handle_t *handle, int thresh)
+{
+ int status, old_nblks;
+
+ BUG_ON(!handle);
+
+ old_nblks = handle->h_buffer_credits;
+ trace_ocfs2_allocate_extend_trans(old_nblks, thresh);
+
+ if (old_nblks < thresh)
+ return 0;
+
+ status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ if (status > 0) {
+ status = jbd2_journal_restart(handle, OCFS2_MAX_TRANS_DATA);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
+bail:
return status;
}
-int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
- struct inode *inode,
- struct buffer_head *bh,
- int type)
+
+struct ocfs2_triggers {
+ struct jbd2_buffer_trigger_type ot_triggers;
+ int ot_offset;
+};
+
+static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
+{
+ return container_of(triggers, struct ocfs2_triggers, ot_triggers);
+}
+
+static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size)
+{
+ struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
+
+ /*
+ * We aren't guaranteed to have the superblock here, so we
+ * must unconditionally compute the ecc data.
+ * __ocfs2_journal_access() will only set the triggers if
+ * metaecc is enabled.
+ */
+ ocfs2_block_check_compute(data, size, data + ot->ot_offset);
+}
+
+/*
+ * Quota blocks have their own trigger because the struct ocfs2_block_check
+ * offset depends on the blocksize.
+ */
+static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size)
+{
+ struct ocfs2_disk_dqtrailer *dqt =
+ ocfs2_block_dqtrailer(size, data);
+
+ /*
+ * We aren't guaranteed to have the superblock here, so we
+ * must unconditionally compute the ecc data.
+ * __ocfs2_journal_access() will only set the triggers if
+ * metaecc is enabled.
+ */
+ ocfs2_block_check_compute(data, size, &dqt->dq_check);
+}
+
+/*
+ * Directory blocks also have their own trigger because the
+ * struct ocfs2_block_check offset depends on the blocksize.
+ */
+static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size)
+{
+ struct ocfs2_dir_block_trailer *trailer =
+ ocfs2_dir_trailer_from_size(size, data);
+
+ /*
+ * We aren't guaranteed to have the superblock here, so we
+ * must unconditionally compute the ecc data.
+ * __ocfs2_journal_access() will only set the triggers if
+ * metaecc is enabled.
+ */
+ ocfs2_block_check_compute(data, size, &trailer->db_check);
+}
+
+static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh)
+{
+ mlog(ML_ERROR,
+ "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, "
+ "bh->b_blocknr = %llu\n",
+ (unsigned long)bh,
+ (unsigned long long)bh->b_blocknr);
+
+ /* We aren't guaranteed to have the superblock here - but if we
+ * don't, it'll just crash. */
+ ocfs2_error(bh->b_assoc_map->host->i_sb,
+ "JBD2 has aborted our journal, ocfs2 cannot continue\n");
+}
+
+static struct ocfs2_triggers di_triggers = {
+ .ot_triggers = {
+ .t_frozen = ocfs2_frozen_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_dinode, i_check),
+};
+
+static struct ocfs2_triggers eb_triggers = {
+ .ot_triggers = {
+ .t_frozen = ocfs2_frozen_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
+};
+
+static struct ocfs2_triggers rb_triggers = {
+ .ot_triggers = {
+ .t_frozen = ocfs2_frozen_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check),
+};
+
+static struct ocfs2_triggers gd_triggers = {
+ .ot_triggers = {
+ .t_frozen = ocfs2_frozen_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
+};
+
+static struct ocfs2_triggers db_triggers = {
+ .ot_triggers = {
+ .t_frozen = ocfs2_db_frozen_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+};
+
+static struct ocfs2_triggers xb_triggers = {
+ .ot_triggers = {
+ .t_frozen = ocfs2_frozen_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
+};
+
+static struct ocfs2_triggers dq_triggers = {
+ .ot_triggers = {
+ .t_frozen = ocfs2_dq_frozen_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+};
+
+static struct ocfs2_triggers dr_triggers = {
+ .ot_triggers = {
+ .t_frozen = ocfs2_frozen_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
+};
+
+static struct ocfs2_triggers dl_triggers = {
+ .ot_triggers = {
+ .t_frozen = ocfs2_frozen_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
+};
+
+static int __ocfs2_journal_access(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh,
+ struct ocfs2_triggers *triggers,
+ int type)
{
int status;
+ struct ocfs2_super *osb =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
- BUG_ON(!inode);
+ BUG_ON(!ci || !ci->ci_ops);
BUG_ON(!handle);
BUG_ON(!bh);
- BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
- mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n",
- (unsigned long long)bh->b_blocknr, type,
- (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
- "OCFS2_JOURNAL_ACCESS_CREATE" :
- "OCFS2_JOURNAL_ACCESS_WRITE",
- bh->b_size);
+ trace_ocfs2_journal_access(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)bh->b_blocknr, type, bh->b_size);
/* we can safely remove this assertion after testing. */
if (!buffer_uptodate(bh)) {
@@ -391,137 +667,128 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
BUG();
}
- /* Set the current transaction information on the inode so
+ /* Set the current transaction information on the ci so
* that the locking code knows whether it can drop it's locks
- * on this inode or not. We're protected from the commit
+ * on this ci or not. We're protected from the commit
* thread updating the current transaction id until
* ocfs2_commit_trans() because ocfs2_start_trans() took
* j_trans_barrier for us. */
- ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
+ ocfs2_set_ci_lock_trans(osb->journal, ci);
- mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_lock(ci);
switch (type) {
case OCFS2_JOURNAL_ACCESS_CREATE:
case OCFS2_JOURNAL_ACCESS_WRITE:
- status = journal_get_write_access(handle->k_handle, bh);
+ status = jbd2_journal_get_write_access(handle, bh);
break;
case OCFS2_JOURNAL_ACCESS_UNDO:
- status = journal_get_undo_access(handle->k_handle, bh);
+ status = jbd2_journal_get_undo_access(handle, bh);
break;
default:
status = -EINVAL;
- mlog(ML_ERROR, "Uknown access type!\n");
+ mlog(ML_ERROR, "Unknown access type!\n");
}
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ if (!status && ocfs2_meta_ecc(osb) && triggers)
+ jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
+ ocfs2_metadata_cache_io_unlock(ci);
if (status < 0)
mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
status, type);
- mlog_exit(status);
return status;
}
-int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
- struct buffer_head *bh)
+int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
{
- int status;
-
- BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
-
- mlog_entry("(bh->b_blocknr=%llu)\n",
- (unsigned long long)bh->b_blocknr);
-
- status = journal_dirty_metadata(handle->k_handle, bh);
- if (status < 0)
- mlog(ML_ERROR, "Could not dirty metadata buffer. "
- "(bh->b_blocknr=%llu)\n",
- (unsigned long long)bh->b_blocknr);
+ return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type);
+}
- mlog_exit(status);
- return status;
+int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type);
}
-int ocfs2_journal_dirty_data(handle_t *handle,
- struct buffer_head *bh)
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
{
- int err = journal_dirty_data(handle, bh);
- if (err)
- mlog_errno(err);
- /* TODO: When we can handle it, abort the handle and go RO on
- * error here. */
+ return __ocfs2_journal_access(handle, ci, bh, &rb_triggers,
+ type);
+}
- return err;
+int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type);
}
-/* We always assume you're adding a metadata lock at level 'ex' */
-int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
- struct inode *inode)
+int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
{
- int status;
- struct ocfs2_journal_lock *lock;
+ return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type);
+}
- BUG_ON(!inode);
+int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type);
+}
- lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS);
- if (!lock) {
- status = -ENOMEM;
- mlog_errno(-ENOMEM);
- goto bail;
- }
+int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type);
+}
- if (!igrab(inode))
- BUG();
- lock->jl_inode = inode;
+int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type);
+}
- list_add_tail(&(lock->jl_lock_list), &(handle->locks));
- handle->num_locks++;
+int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type);
+}
- status = 0;
-bail:
- mlog_exit(status);
- return status;
+int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, ci, bh, NULL, type);
}
-static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
- struct ocfs2_journal_handle *handle)
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
{
- struct list_head *p, *n;
- struct ocfs2_journal_lock *lock;
- struct inode *inode;
+ int status;
- list_for_each_safe(p, n, &(handle->locks)) {
- lock = list_entry(p, struct ocfs2_journal_lock,
- jl_lock_list);
- list_del(&lock->jl_lock_list);
- handle->num_locks--;
+ trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
- inode = lock->jl_inode;
- ocfs2_meta_unlock(inode, 1);
- if (atomic_read(&inode->i_count) == 1)
- mlog(ML_ERROR,
- "Inode %llu, I'm doing a last iput for!",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- iput(inode);
- kmem_cache_free(ocfs2_lock_cache, lock);
- }
+ status = jbd2_journal_dirty_metadata(handle, bh);
+ BUG_ON(status);
}
-#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5)
+#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
void ocfs2_set_journal_params(struct ocfs2_super *osb)
{
journal_t *journal = osb->journal->j_journal;
+ unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
- spin_lock(&journal->j_state_lock);
- journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
+ if (osb->osb_commit_interval)
+ commit_interval = osb->osb_commit_interval;
+
+ write_lock(&journal->j_state_lock);
+ journal->j_commit_interval = commit_interval;
if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
- journal->j_flags |= JFS_BARRIER;
+ journal->j_flags |= JBD2_BARRIER;
else
- journal->j_flags &= ~JFS_BARRIER;
- spin_unlock(&journal->j_state_lock);
+ journal->j_flags &= ~JBD2_BARRIER;
+ write_unlock(&journal->j_state_lock);
}
int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
@@ -532,9 +799,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
struct ocfs2_dinode *di = NULL;
struct buffer_head *bh = NULL;
struct ocfs2_super *osb;
- int meta_lock = 0;
-
- mlog_entry_void();
+ int inode_lock = 0;
BUG_ON(!journal);
@@ -562,39 +827,36 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
/* Skip recovery waits here - journal inode metadata never
* changes in a live cluster so it can be considered an
* exception to the rule. */
- status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
- OCFS2_META_LOCK_RECOVERY);
+ status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
if (status < 0) {
if (status != -ERESTARTSYS)
mlog(ML_ERROR, "Could not get lock on journal!\n");
goto done;
}
- meta_lock = 1;
+ inode_lock = 1;
di = (struct ocfs2_dinode *)bh->b_data;
- if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
+ if (i_size_read(inode) < OCFS2_MIN_JOURNAL_SIZE) {
mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
- inode->i_size);
+ i_size_read(inode));
status = -EINVAL;
goto done;
}
- mlog(0, "inode->i_size = %lld\n", inode->i_size);
- mlog(0, "inode->i_blocks = %llu\n",
- (unsigned long long)inode->i_blocks);
- mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
+ trace_ocfs2_journal_init(i_size_read(inode),
+ (unsigned long long)inode->i_blocks,
+ OCFS2_I(inode)->ip_clusters);
/* call the kernels journal init function now */
- j_journal = journal_init_inode(inode);
+ j_journal = jbd2_journal_init_inode(inode);
if (j_journal == NULL) {
mlog(ML_ERROR, "Linux journal layer error\n");
status = -EINVAL;
goto done;
}
- mlog(0, "Returned from journal_init_inode\n");
- mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
+ trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen);
*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
OCFS2_JOURNAL_DIRTY_FL);
@@ -610,22 +872,30 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
status = 0;
done:
if (status < 0) {
- if (meta_lock)
- ocfs2_meta_unlock(inode, 1);
- if (bh != NULL)
- brelse(bh);
+ if (inode_lock)
+ ocfs2_inode_unlock(inode, 1);
+ brelse(bh);
if (inode) {
OCFS2_I(inode)->ip_open_count--;
iput(inode);
}
}
- mlog_exit(status);
return status;
}
+static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di)
+{
+ le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1);
+}
+
+static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di)
+{
+ return le32_to_cpu(di->id1.journal1.ij_recovery_generation);
+}
+
static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
- int dirty)
+ int dirty, int replayed)
{
int status;
unsigned int flags;
@@ -633,19 +903,12 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
struct buffer_head *bh = journal->j_bh;
struct ocfs2_dinode *fe;
- mlog_entry_void();
-
fe = (struct ocfs2_dinode *)bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- /* This is called from startup/shutdown which will
- * handle the errors in a specific manner, so no need
- * to call ocfs2_error() here. */
- mlog(ML_ERROR, "Journal dinode %llu has invalid "
- "signature: %.*s", (unsigned long long)fe->i_blkno, 7,
- fe->i_signature);
- status = -EIO;
- goto out;
- }
+
+ /* The journal bh on the osb always comes from ocfs2_journal_init()
+ * and was validated there inside ocfs2_inode_lock_full(). It's a
+ * code bug if we mess it up. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
flags = le32_to_cpu(fe->id1.journal1.ij_flags);
if (dirty)
@@ -654,12 +917,14 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
flags &= ~OCFS2_JOURNAL_DIRTY_FL;
fe->id1.journal1.ij_flags = cpu_to_le32(flags);
- status = ocfs2_write_block(osb, bh, journal->j_inode);
+ if (replayed)
+ ocfs2_bump_recovery_generation(fe);
+
+ ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(journal->j_inode));
if (status < 0)
mlog_errno(status);
-out:
- mlog_exit(status);
return status;
}
@@ -674,8 +939,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
struct inode *inode = NULL;
int num_running_trans = 0;
- mlog_entry_void();
-
BUG_ON(!osb);
journal = osb->journal;
@@ -687,15 +950,12 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
if (journal->j_state != OCFS2_JOURNAL_LOADED)
goto done;
- /* need to inc inode use count as journal_destroy will iput. */
+ /* need to inc inode use count - jbd2_journal_destroy will iput. */
if (!igrab(inode))
BUG();
num_running_trans = atomic_read(&(osb->journal->j_num_trans));
- if (num_running_trans > 0)
- mlog(0, "Shutting down journal: must wait on %d "
- "running transactions!\n",
- num_running_trans);
+ trace_ocfs2_journal_shutdown(num_running_trans);
/* Do a commit_cache here. It will flush our journal, *and*
* release any locks that are still held.
@@ -708,24 +968,39 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
* completely destroy the journal. */
if (osb->commit_task) {
/* Wait for the commit thread */
- mlog(0, "Waiting for ocfs2commit to exit....\n");
+ trace_ocfs2_journal_shutdown_wait(osb->commit_task);
kthread_stop(osb->commit_task);
osb->commit_task = NULL;
}
BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
- status = ocfs2_journal_toggle_dirty(osb, 0);
- if (status < 0)
- mlog_errno(status);
+ if (ocfs2_mount_local(osb)) {
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal);
+ jbd2_journal_unlock_updates(journal->j_journal);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
+ if (status == 0) {
+ /*
+ * Do not toggle if flush was unsuccessful otherwise
+ * will leave dirty metadata in a "clean" journal
+ */
+ status = ocfs2_journal_toggle_dirty(osb, 0, 0);
+ if (status < 0)
+ mlog_errno(status);
+ }
/* Shutdown the kernel journal system */
- journal_destroy(journal->j_journal);
+ jbd2_journal_destroy(journal->j_journal);
+ journal->j_journal = NULL;
OCFS2_I(inode)->ip_open_count--;
/* unlock our journal */
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
brelse(journal->j_bh);
journal->j_bh = NULL;
@@ -736,7 +1011,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
done:
if (inode)
iput(inode);
- mlog_exit_void();
}
static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -745,31 +1019,28 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
{
int olderr;
- olderr = journal_errno(journal);
+ olderr = jbd2_journal_errno(journal);
if (olderr) {
mlog(ML_ERROR, "File system error %d recorded in "
"journal %u.\n", olderr, slot);
mlog(ML_ERROR, "File system on device %s needs checking.\n",
sb->s_id);
- journal_ack_err(journal);
- journal_clear_err(journal);
+ jbd2_journal_ack_err(journal);
+ jbd2_journal_clear_err(journal);
}
}
-int ocfs2_journal_load(struct ocfs2_journal *journal)
+int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
{
int status = 0;
struct ocfs2_super *osb;
- mlog_entry_void();
-
- if (!journal)
- BUG();
+ BUG_ON(!journal);
osb = journal->j_osb;
- status = journal_load(journal->j_journal);
+ status = jbd2_journal_load(journal->j_journal);
if (status < 0) {
mlog(ML_ERROR, "Failed to load journal!\n");
goto done;
@@ -777,24 +1048,27 @@ int ocfs2_journal_load(struct ocfs2_journal *journal)
ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
- status = ocfs2_journal_toggle_dirty(osb, 1);
+ status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
if (status < 0) {
mlog_errno(status);
goto done;
}
/* Launch the commit thread */
- osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt");
- if (IS_ERR(osb->commit_task)) {
- status = PTR_ERR(osb->commit_task);
+ if (!local) {
+ osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
+ "ocfs2cmt");
+ if (IS_ERR(osb->commit_task)) {
+ status = PTR_ERR(osb->commit_task);
+ osb->commit_task = NULL;
+ mlog(ML_ERROR, "unable to launch ocfs2commit thread, "
+ "error=%d", status);
+ goto done;
+ }
+ } else
osb->commit_task = NULL;
- mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d",
- status);
- goto done;
- }
done:
- mlog_exit(status);
return status;
}
@@ -805,25 +1079,39 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
{
int status;
- mlog_entry_void();
-
BUG_ON(!journal);
- status = journal_wipe(journal->j_journal, full);
+ status = jbd2_journal_wipe(journal->j_journal, full);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
+ status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0);
if (status < 0)
mlog_errno(status);
bail:
- mlog_exit(status);
return status;
}
+static int ocfs2_recovery_completed(struct ocfs2_super *osb)
+{
+ int empty;
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+ spin_lock(&osb->osb_lock);
+ empty = (rm->rm_used == 0);
+ spin_unlock(&osb->osb_lock);
+
+ return empty;
+}
+
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
+{
+ wait_event(osb->recovery_event, ocfs2_recovery_completed(osb));
+}
+
/*
* JBD Might read a cached version of another nodes journal file. We
* don't want this as this file changes often and we get no
@@ -837,29 +1125,18 @@ bail:
static int ocfs2_force_read_journal(struct inode *inode)
{
int status = 0;
- int i, p_blocks;
- u64 v_blkno, p_blkno;
-#define CONCURRENT_JOURNAL_FILL 32
+ int i;
+ u64 v_blkno, p_blkno, p_blocks, num_blocks;
+#define CONCURRENT_JOURNAL_FILL 32ULL
struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
- mlog_entry_void();
-
- BUG_ON(inode->i_blocks !=
- ocfs2_align_bytes_to_sectors(i_size_read(inode)));
-
memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
- mlog(0, "Force reading %llu blocks\n",
- (unsigned long long)(inode->i_blocks >>
- (inode->i_sb->s_blocksize_bits - 9)));
-
+ num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
v_blkno = 0;
- while (v_blkno <
- (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
-
+ while (v_blkno < num_blocks) {
status = ocfs2_extent_map_get_blocks(inode, v_blkno,
- 1, &p_blkno,
- &p_blocks);
+ &p_blkno, &p_blocks, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -870,9 +1147,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
/* We are reading journal data which should not
* be put in the uptodate cache */
- status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
- p_blkno, p_blocks, bhs, 0,
- NULL);
+ status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
+ p_blkno, p_blocks, bhs);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -888,9 +1164,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
bail:
for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
- if (bhs[i])
- brelse(bhs[i]);
- mlog_exit(status);
+ brelse(bhs[i]);
return status;
}
@@ -899,6 +1173,7 @@ struct ocfs2_la_recovery_item {
int lri_slot;
struct ocfs2_dinode *lri_la_dinode;
struct ocfs2_dinode *lri_tl_dinode;
+ struct ocfs2_quota_recovery *lri_qrec;
};
/* Does the second half of the recovery process. By this point, the
@@ -911,35 +1186,39 @@ struct ocfs2_la_recovery_item {
* NOTE: This function can and will sleep on recovery of other nodes
* during cluster locking, just like any other ocfs2 process.
*/
-void ocfs2_complete_recovery(void *data)
+void ocfs2_complete_recovery(struct work_struct *work)
{
- int ret;
- struct ocfs2_super *osb = data;
- struct ocfs2_journal *journal = osb->journal;
+ int ret = 0;
+ struct ocfs2_journal *journal =
+ container_of(work, struct ocfs2_journal, j_recovery_work);
+ struct ocfs2_super *osb = journal->j_osb;
struct ocfs2_dinode *la_dinode, *tl_dinode;
- struct ocfs2_la_recovery_item *item;
- struct list_head *p, *n;
+ struct ocfs2_la_recovery_item *item, *n;
+ struct ocfs2_quota_recovery *qrec;
LIST_HEAD(tmp_la_list);
- mlog_entry_void();
-
- mlog(0, "completing recovery from keventd\n");
+ trace_ocfs2_complete_recovery(
+ (unsigned long long)OCFS2_I(journal->j_inode)->ip_blkno);
spin_lock(&journal->j_lock);
list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
spin_unlock(&journal->j_lock);
- list_for_each_safe(p, n, &tmp_la_list) {
- item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
+ list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
list_del_init(&item->lri_list);
- mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
+ ocfs2_wait_on_quotas(osb);
la_dinode = item->lri_la_dinode;
- if (la_dinode) {
- mlog(0, "Clean up local alloc %llu\n",
- (unsigned long long)la_dinode->i_blkno);
+ tl_dinode = item->lri_tl_dinode;
+ qrec = item->lri_qrec;
+
+ trace_ocfs2_complete_recovery_slot(item->lri_slot,
+ la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
+ tl_dinode ? le64_to_cpu(tl_dinode->i_blkno) : 0,
+ qrec);
+ if (la_dinode) {
ret = ocfs2_complete_local_alloc_recovery(osb,
la_dinode);
if (ret < 0)
@@ -948,11 +1227,7 @@ void ocfs2_complete_recovery(void *data)
kfree(la_dinode);
}
- tl_dinode = item->lri_tl_dinode;
if (tl_dinode) {
- mlog(0, "Clean up truncate log %llu\n",
- (unsigned long long)tl_dinode->i_blkno);
-
ret = ocfs2_complete_truncate_log_recovery(osb,
tl_dinode);
if (ret < 0)
@@ -965,11 +1240,18 @@ void ocfs2_complete_recovery(void *data)
if (ret < 0)
mlog_errno(ret);
+ if (qrec) {
+ ret = ocfs2_finish_quota_recovery(osb, qrec,
+ item->lri_slot);
+ if (ret < 0)
+ mlog_errno(ret);
+ /* Recovery info is already freed now */
+ }
+
kfree(item);
}
- mlog(0, "Recovery completion\n");
- mlog_exit_void();
+ trace_ocfs2_complete_recovery_end(ret);
}
/* NOTE: This function always eats your references to la_dinode and
@@ -978,7 +1260,8 @@ void ocfs2_complete_recovery(void *data)
static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
int slot_num,
struct ocfs2_dinode *la_dinode,
- struct ocfs2_dinode *tl_dinode)
+ struct ocfs2_dinode *tl_dinode,
+ struct ocfs2_quota_recovery *qrec)
{
struct ocfs2_la_recovery_item *item;
@@ -987,11 +1270,11 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
/* Though we wish to avoid it, we are in fact safe in
* skipping local alloc cleanup as fsck.ocfs2 is more
* than capable of reclaiming unused space. */
- if (la_dinode)
- kfree(la_dinode);
+ kfree(la_dinode);
+ kfree(tl_dinode);
- if (tl_dinode)
- kfree(tl_dinode);
+ if (qrec)
+ ocfs2_free_quota_recovery(qrec);
mlog_errno(-ENOMEM);
return;
@@ -1001,6 +1284,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
item->lri_la_dinode = la_dinode;
item->lri_slot = slot_num;
item->lri_tl_dinode = tl_dinode;
+ item->lri_qrec = qrec;
spin_lock(&journal->j_lock);
list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -1009,37 +1293,60 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
}
/* Called by the mount code to queue recovery the last part of
- * recovery for it's own slot. */
+ * recovery for it's own and offline slot(s). */
void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
{
struct ocfs2_journal *journal = osb->journal;
- if (osb->dirty) {
- /* No need to queue up our truncate_log as regular
- * cleanup will catch that. */
- ocfs2_queue_recovery_completion(journal,
- osb->slot_num,
- osb->local_alloc_copy,
- NULL);
- ocfs2_schedule_truncate_log_flush(osb, 0);
+ if (ocfs2_is_hard_readonly(osb))
+ return;
+
+ /* No need to queue up our truncate_log as regular cleanup will catch
+ * that */
+ ocfs2_queue_recovery_completion(journal, osb->slot_num,
+ osb->local_alloc_copy, NULL, NULL);
+ ocfs2_schedule_truncate_log_flush(osb, 0);
+
+ osb->local_alloc_copy = NULL;
+ osb->dirty = 0;
+
+ /* queue to recover orphan slots for all offline slots */
+ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+ ocfs2_queue_replay_slots(osb);
+ ocfs2_free_replay_slots(osb);
+}
- osb->local_alloc_copy = NULL;
- osb->dirty = 0;
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
+{
+ if (osb->quota_rec) {
+ ocfs2_queue_recovery_completion(osb->journal,
+ osb->slot_num,
+ NULL,
+ NULL,
+ osb->quota_rec);
+ osb->quota_rec = NULL;
}
}
static int __ocfs2_recovery_thread(void *arg)
{
- int status, node_num;
+ int status, node_num, slot_num;
struct ocfs2_super *osb = arg;
-
- mlog_entry_void();
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
+ int *rm_quota = NULL;
+ int rm_quota_used = 0, i;
+ struct ocfs2_quota_recovery *qrec;
status = ocfs2_wait_on_mount(osb);
if (status < 0) {
goto bail;
}
+ rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
+ if (!rm_quota) {
+ status = -ENOMEM;
+ goto bail;
+ }
restart:
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
@@ -1047,49 +1354,95 @@ restart:
goto bail;
}
- while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
- node_num = ocfs2_node_map_first_set_bit(osb,
- &osb->recovery_map);
- if (node_num == O2NM_INVALID_NODE_NUM) {
- mlog(0, "Out of nodes to recover.\n");
- break;
+ status = ocfs2_compute_replay_slots(osb);
+ if (status < 0)
+ mlog_errno(status);
+
+ /* queue recovery for our own slot */
+ ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
+ NULL, NULL);
+
+ spin_lock(&osb->osb_lock);
+ while (rm->rm_used) {
+ /* It's always safe to remove entry zero, as we won't
+ * clear it until ocfs2_recover_node() has succeeded. */
+ node_num = rm->rm_entries[0];
+ spin_unlock(&osb->osb_lock);
+ slot_num = ocfs2_node_num_to_slot(osb, node_num);
+ trace_ocfs2_recovery_thread_node(node_num, slot_num);
+ if (slot_num == -ENOENT) {
+ status = 0;
+ goto skip_recovery;
}
- status = ocfs2_recover_node(osb, node_num);
- if (status < 0) {
+ /* It is a bit subtle with quota recovery. We cannot do it
+ * immediately because we have to obtain cluster locks from
+ * quota files and we also don't want to just skip it because
+ * then quota usage would be out of sync until some node takes
+ * the slot. So we remember which nodes need quota recovery
+ * and when everything else is done, we recover quotas. */
+ for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
+ if (i == rm_quota_used)
+ rm_quota[rm_quota_used++] = slot_num;
+
+ status = ocfs2_recover_node(osb, node_num, slot_num);
+skip_recovery:
+ if (!status) {
+ ocfs2_recovery_map_clear(osb, node_num);
+ } else {
mlog(ML_ERROR,
"Error %d recovering node %d on device (%u,%u)!\n",
status, node_num,
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
mlog(ML_ERROR, "Volume requires unmount.\n");
- continue;
}
- ocfs2_recovery_map_clear(osb, node_num);
+ spin_lock(&osb->osb_lock);
+ }
+ spin_unlock(&osb->osb_lock);
+ trace_ocfs2_recovery_thread_end(status);
+
+ /* Refresh all journal recovery generations from disk */
+ status = ocfs2_check_journals_nolocks(osb);
+ status = (status == -EROFS) ? 0 : status;
+ if (status < 0)
+ mlog_errno(status);
+
+ /* Now it is right time to recover quotas... We have to do this under
+ * superblock lock so that no one can start using the slot (and crash)
+ * before we recover it */
+ for (i = 0; i < rm_quota_used; i++) {
+ qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+ if (IS_ERR(qrec)) {
+ status = PTR_ERR(qrec);
+ mlog_errno(status);
+ continue;
+ }
+ ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
+ NULL, NULL, qrec);
}
+
ocfs2_super_unlock(osb, 1);
- /* We always run recovery on our own orphan dir - the dead
- * node(s) may have voted "no" on an inode delete earlier. A
- * revote is therefore required. */
- ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
- NULL);
+ /* queue recovery for offline slots */
+ ocfs2_queue_replay_slots(osb);
bail:
mutex_lock(&osb->recovery_lock);
- if (!status &&
- !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+ if (!status && !ocfs2_recovery_completed(osb)) {
mutex_unlock(&osb->recovery_lock);
goto restart;
}
+ ocfs2_free_replay_slots(osb);
osb->recovery_thread_task = NULL;
mb(); /* sync with ocfs2_recovery_thread_running */
wake_up(&osb->recovery_event);
mutex_unlock(&osb->recovery_lock);
- mlog_exit(status);
+ kfree(rm_quota);
+
/* no one is callint kthread_stop() for us so the kthread() api
* requires that we call do_exit(). And it isn't exported, but
* complete_and_exit() seems to be a minimal wrapper around it. */
@@ -1099,19 +1452,15 @@ bail:
void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
{
- mlog_entry("(node_num=%d, osb->node_num = %d)\n",
- node_num, osb->node_num);
-
mutex_lock(&osb->recovery_lock);
- if (osb->disable_recovery)
- goto out;
- /* People waiting on recovery will wait on
- * the recovery map to empty. */
- if (!ocfs2_recovery_map_set(osb, node_num))
- mlog(0, "node %d already be in recovery.\n", node_num);
+ trace_ocfs2_recovery_thread(node_num, osb->node_num,
+ osb->disable_recovery, osb->recovery_thread_task,
+ osb->disable_recovery ?
+ -1 : ocfs2_recovery_map_set(osb, node_num));
- mlog(0, "starting recovery thread...\n");
+ if (osb->disable_recovery)
+ goto out;
if (osb->recovery_thread_task)
goto out;
@@ -1126,8 +1475,42 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
out:
mutex_unlock(&osb->recovery_lock);
wake_up(&osb->recovery_event);
+}
+
+static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
+ int slot_num,
+ struct buffer_head **bh,
+ struct inode **ret_inode)
+{
+ int status = -EACCES;
+ struct inode *inode = NULL;
+
+ BUG_ON(slot_num >= osb->max_slots);
- mlog_exit_void();
+ inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+ slot_num);
+ if (!inode || is_bad_inode(inode)) {
+ mlog_errno(status);
+ goto bail;
+ }
+ SET_INODE_JOURNAL(inode);
+
+ status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = 0;
+
+bail:
+ if (inode) {
+ if (status || !ret_inode)
+ iput(inode);
+ else
+ *ret_inode = inode;
+ }
+ return status;
}
/* Does the actual journal replay and marks the journal inode as
@@ -1143,27 +1526,40 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
struct ocfs2_dinode *fe;
journal_t *journal = NULL;
struct buffer_head *bh = NULL;
+ u32 slot_reco_gen;
- inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
- slot_num);
- if (inode == NULL) {
- status = -EACCES;
+ status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode);
+ if (status) {
mlog_errno(status);
goto done;
}
- if (is_bad_inode(inode)) {
- status = -EACCES;
- iput(inode);
- inode = NULL;
- mlog_errno(status);
+
+ fe = (struct ocfs2_dinode *)bh->b_data;
+ slot_reco_gen = ocfs2_get_recovery_generation(fe);
+ brelse(bh);
+ bh = NULL;
+
+ /*
+ * As the fs recovery is asynchronous, there is a small chance that
+ * another node mounted (and recovered) the slot before the recovery
+ * thread could get the lock. To handle that, we dirty read the journal
+ * inode for that slot to get the recovery generation. If it is
+ * different than what we expected, the slot has been recovered.
+ * If not, it needs recovery.
+ */
+ if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
+ trace_ocfs2_replay_journal_recovered(slot_num,
+ osb->slot_recovery_generations[slot_num], slot_reco_gen);
+ osb->slot_recovery_generations[slot_num] = slot_reco_gen;
+ status = -EBUSY;
goto done;
}
- SET_INODE_JOURNAL(inode);
- status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
- OCFS2_META_LOCK_RECOVERY);
+ /* Continue with recovery as the journal has not yet been recovered */
+
+ status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
if (status < 0) {
- mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
+ trace_ocfs2_replay_journal_lock_err(status);
if (status != -ERESTARTSYS)
mlog(ML_ERROR, "Could not lock journal!\n");
goto done;
@@ -1173,15 +1569,21 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
fe = (struct ocfs2_dinode *) bh->b_data;
flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+ slot_reco_gen = ocfs2_get_recovery_generation(fe);
if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
- mlog(0, "No recovery required for node %d\n", node_num);
+ trace_ocfs2_replay_journal_skip(node_num);
+ /* Refresh recovery generation for the slot */
+ osb->slot_recovery_generations[slot_num] = slot_reco_gen;
goto done;
}
- mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
- node_num, slot_num,
- MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+ /* we need to run complete recovery for offline orphan slots */
+ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+
+ printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\
+ "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -1191,30 +1593,28 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
goto done;
}
- mlog(0, "calling journal_init_inode\n");
- journal = journal_init_inode(inode);
+ journal = jbd2_journal_init_inode(inode);
if (journal == NULL) {
mlog(ML_ERROR, "Linux journal layer error\n");
status = -EIO;
goto done;
}
- status = journal_load(journal);
+ status = jbd2_journal_load(journal);
if (status < 0) {
mlog_errno(status);
if (!igrab(inode))
BUG();
- journal_destroy(journal);
+ jbd2_journal_destroy(journal);
goto done;
}
ocfs2_clear_journal_error(osb->sb, journal, slot_num);
/* wipe the journal */
- mlog(0, "flushing the journal.\n");
- journal_lock_updates(journal);
- status = journal_flush(journal);
- journal_unlock_updates(journal);
+ jbd2_journal_lock_updates(journal);
+ status = jbd2_journal_flush(journal);
+ jbd2_journal_unlock_updates(journal);
if (status < 0)
mlog_errno(status);
@@ -1223,27 +1623,34 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
flags &= ~OCFS2_JOURNAL_DIRTY_FL;
fe->id1.journal1.ij_flags = cpu_to_le32(flags);
- status = ocfs2_write_block(osb, bh, inode);
+ /* Increment recovery generation to indicate successful recovery */
+ ocfs2_bump_recovery_generation(fe);
+ osb->slot_recovery_generations[slot_num] =
+ ocfs2_get_recovery_generation(fe);
+
+ ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
if (status < 0)
mlog_errno(status);
if (!igrab(inode))
BUG();
- journal_destroy(journal);
+ jbd2_journal_destroy(journal);
+ printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\
+ "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
done:
/* drop the lock on this nodes journal */
if (got_lock)
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
if (inode)
iput(inode);
- if (bh)
- brelse(bh);
+ brelse(bh);
- mlog_exit(status);
return status;
}
@@ -1260,34 +1667,25 @@ done:
* far less concerning.
*/
static int ocfs2_recover_node(struct ocfs2_super *osb,
- int node_num)
+ int node_num, int slot_num)
{
int status = 0;
- int slot_num;
- struct ocfs2_slot_info *si = osb->slot_info;
struct ocfs2_dinode *la_copy = NULL;
struct ocfs2_dinode *tl_copy = NULL;
- mlog_entry("(node_num=%d, osb->node_num = %d)\n",
- node_num, osb->node_num);
-
- mlog(0, "checking node %d\n", node_num);
+ trace_ocfs2_recover_node(node_num, slot_num, osb->node_num);
/* Should not ever be called to recover ourselves -- in that
* case we should've called ocfs2_journal_load instead. */
BUG_ON(osb->node_num == node_num);
- slot_num = ocfs2_node_num_to_slot(si, node_num);
- if (slot_num == OCFS2_INVALID_SLOT) {
- status = 0;
- mlog(0, "no slot for this node, so no recovery required.\n");
- goto done;
- }
-
- mlog(0, "node %d was using slot %d\n", node_num, slot_num);
-
status = ocfs2_replay_journal(osb, node_num, slot_num);
if (status < 0) {
+ if (status == -EBUSY) {
+ trace_ocfs2_recover_node_skip(slot_num, node_num);
+ status = 0;
+ goto done;
+ }
mlog_errno(status);
goto done;
}
@@ -1308,19 +1706,17 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
/* Likewise, this would be a strange but ultimately not so
* harmful place to get an error... */
- ocfs2_clear_slot(si, slot_num);
- status = ocfs2_update_disk_slots(osb, si);
+ status = ocfs2_clear_slot(osb, slot_num);
if (status < 0)
mlog_errno(status);
/* This will kfree the memory pointed to by la_copy and tl_copy */
ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
- tl_copy);
+ tl_copy, NULL);
status = 0;
done:
- mlog_exit(status);
return status;
}
@@ -1350,14 +1746,14 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
SET_INODE_JOURNAL(inode);
flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
- status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags);
+ status = ocfs2_inode_lock_full(inode, NULL, 1, flags);
if (status < 0) {
if (status != -EAGAIN)
mlog_errno(status);
goto bail;
}
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
bail:
if (inode)
iput(inode);
@@ -1369,23 +1765,49 @@ bail:
* slot info struct has been updated from disk. */
int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
{
- int status, i, node_num;
- struct ocfs2_slot_info *si = osb->slot_info;
+ unsigned int node_num;
+ int status, i;
+ u32 gen;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_dinode *di;
/* This is called with the super block cluster lock, so we
* know that the slot map can't change underneath us. */
- spin_lock(&si->si_lock);
- for(i = 0; i < si->si_num_slots; i++) {
- if (i == osb->slot_num)
+ for (i = 0; i < osb->max_slots; i++) {
+ /* Read journal inode to get the recovery generation */
+ status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ di = (struct ocfs2_dinode *)bh->b_data;
+ gen = ocfs2_get_recovery_generation(di);
+ brelse(bh);
+ bh = NULL;
+
+ spin_lock(&osb->osb_lock);
+ osb->slot_recovery_generations[i] = gen;
+
+ trace_ocfs2_mark_dead_nodes(i,
+ osb->slot_recovery_generations[i]);
+
+ if (i == osb->slot_num) {
+ spin_unlock(&osb->osb_lock);
continue;
- if (ocfs2_is_empty_slot(si, i))
+ }
+
+ status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
+ if (status == -ENOENT) {
+ spin_unlock(&osb->osb_lock);
continue;
+ }
- node_num = si->si_global_node_nums[i];
- if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
+ if (__ocfs2_recovery_map_test(osb, node_num)) {
+ spin_unlock(&osb->osb_lock);
continue;
- spin_unlock(&si->si_lock);
+ }
+ spin_unlock(&osb->osb_lock);
/* Ok, we have a slot occupied by another node which
* is not in the recovery map. We trylock his journal
@@ -1400,28 +1822,203 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
mlog_errno(status);
goto bail;
}
-
- spin_lock(&si->si_lock);
}
- spin_unlock(&si->si_lock);
status = 0;
bail:
- mlog_exit(status);
return status;
}
+/*
+ * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some
+ * randomness to the timeout to minimize multple nodes firing the timer at the
+ * same time.
+ */
+static inline unsigned long ocfs2_orphan_scan_timeout(void)
+{
+ unsigned long time;
+
+ get_random_bytes(&time, sizeof(time));
+ time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
+ return msecs_to_jiffies(time);
+}
+
+/*
+ * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
+ * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
+ * is done to catch any orphans that are left over in orphan directories.
+ *
+ * It scans all slots, even ones that are in use. It does so to handle the
+ * case described below:
+ *
+ * Node 1 has an inode it was using. The dentry went away due to memory
+ * pressure. Node 1 closes the inode, but it's on the free list. The node
+ * has the open lock.
+ * Node 2 unlinks the inode. It grabs the dentry lock to notify others,
+ * but node 1 has no dentry and doesn't get the message. It trylocks the
+ * open lock, sees that another node has a PR, and does nothing.
+ * Later node 2 runs its orphan dir. It igets the inode, trylocks the
+ * open lock, sees the PR still, and does nothing.
+ * Basically, we have to trigger an orphan iput on node 1. The only way
+ * for this to happen is if node 1 runs node 2's orphan dir.
+ *
+ * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
+ * seconds. It gets an EX lock on os_lockres and checks sequence number
+ * stored in LVB. If the sequence number has changed, it means some other
+ * node has done the scan. This node skips the scan and tracks the
+ * sequence number. If the sequence number didn't change, it means a scan
+ * hasn't happened. The node queues a scan and increments the
+ * sequence number in the LVB.
+ */
+void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+ int status, i;
+ u32 seqno = 0;
+
+ os = &osb->osb_orphan_scan;
+
+ if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
+ goto out;
+
+ trace_ocfs2_queue_orphan_scan_begin(os->os_count, os->os_seqno,
+ atomic_read(&os->os_state));
+
+ status = ocfs2_orphan_scan_lock(osb, &seqno);
+ if (status < 0) {
+ if (status != -EAGAIN)
+ mlog_errno(status);
+ goto out;
+ }
+
+ /* Do no queue the tasks if the volume is being umounted */
+ if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
+ goto unlock;
+
+ if (os->os_seqno != seqno) {
+ os->os_seqno = seqno;
+ goto unlock;
+ }
+
+ for (i = 0; i < osb->max_slots; i++)
+ ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
+ NULL);
+ /*
+ * We queued a recovery on orphan slots, increment the sequence
+ * number and update LVB so other node will skip the scan for a while
+ */
+ seqno++;
+ os->os_count++;
+ os->os_scantime = CURRENT_TIME;
+unlock:
+ ocfs2_orphan_scan_unlock(osb, seqno);
+out:
+ trace_ocfs2_queue_orphan_scan_end(os->os_count, os->os_seqno,
+ atomic_read(&os->os_state));
+ return;
+}
+
+/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */
+void ocfs2_orphan_scan_work(struct work_struct *work)
+{
+ struct ocfs2_orphan_scan *os;
+ struct ocfs2_super *osb;
+
+ os = container_of(work, struct ocfs2_orphan_scan,
+ os_orphan_scan_work.work);
+ osb = os->os_osb;
+
+ mutex_lock(&os->os_lock);
+ ocfs2_queue_orphan_scan(osb);
+ if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
+ queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ ocfs2_orphan_scan_timeout());
+ mutex_unlock(&os->os_lock);
+}
+
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+
+ os = &osb->osb_orphan_scan;
+ if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) {
+ atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
+ mutex_lock(&os->os_lock);
+ cancel_delayed_work(&os->os_orphan_scan_work);
+ mutex_unlock(&os->os_lock);
+ }
+}
+
+void ocfs2_orphan_scan_init(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+
+ os = &osb->osb_orphan_scan;
+ os->os_osb = osb;
+ os->os_count = 0;
+ os->os_seqno = 0;
+ mutex_init(&os->os_lock);
+ INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work);
+}
+
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+
+ os = &osb->osb_orphan_scan;
+ os->os_scantime = CURRENT_TIME;
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
+ atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
+ else {
+ atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
+ queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ ocfs2_orphan_scan_timeout());
+ }
+}
+
+struct ocfs2_orphan_filldir_priv {
+ struct dir_context ctx;
+ struct inode *head;
+ struct ocfs2_super *osb;
+};
+
+static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
+ loff_t pos, u64 ino, unsigned type)
+{
+ struct ocfs2_orphan_filldir_priv *p = priv;
+ struct inode *iter;
+
+ if (name_len == 1 && !strncmp(".", name, 1))
+ return 0;
+ if (name_len == 2 && !strncmp("..", name, 2))
+ return 0;
+
+ /* Skip bad inodes so that recovery can continue */
+ iter = ocfs2_iget(p->osb, ino,
+ OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
+ if (IS_ERR(iter))
+ return 0;
+
+ trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
+ /* No locking is required for the next_orphan queue as there
+ * is only ever a single process doing orphan recovery. */
+ OCFS2_I(iter)->ip_next_orphan = p->head;
+ p->head = iter;
+
+ return 0;
+}
+
static int ocfs2_queue_orphans(struct ocfs2_super *osb,
int slot,
struct inode **head)
{
int status;
struct inode *orphan_dir_inode = NULL;
- struct inode *iter;
- unsigned long offset, blk, local;
- struct buffer_head *bh = NULL;
- struct ocfs2_dir_entry *de;
- struct super_block *sb = osb->sb;
+ struct ocfs2_orphan_filldir_priv priv = {
+ .ctx.actor = ocfs2_orphan_filldir,
+ .osb = osb,
+ .head = *head
+ };
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
ORPHAN_DIR_SYSTEM_INODE,
@@ -1430,87 +2027,25 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
status = -ENOENT;
mlog_errno(status);
return status;
- }
+ }
mutex_lock(&orphan_dir_inode->i_mutex);
- status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
+ status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
if (status < 0) {
mlog_errno(status);
goto out;
}
- offset = 0;
- iter = NULL;
- while(offset < i_size_read(orphan_dir_inode)) {
- blk = offset >> sb->s_blocksize_bits;
-
- bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
- if (!bh)
- status = -EINVAL;
- if (status < 0) {
- if (bh)
- brelse(bh);
- mlog_errno(status);
- goto out_unlock;
- }
-
- local = 0;
- while(offset < i_size_read(orphan_dir_inode)
- && local < sb->s_blocksize) {
- de = (struct ocfs2_dir_entry *) (bh->b_data + local);
-
- if (!ocfs2_check_dir_entry(orphan_dir_inode,
- de, bh, local)) {
- status = -EINVAL;
- mlog_errno(status);
- brelse(bh);
- goto out_unlock;
- }
-
- local += le16_to_cpu(de->rec_len);
- offset += le16_to_cpu(de->rec_len);
-
- /* I guess we silently fail on no inode? */
- if (!le64_to_cpu(de->inode))
- continue;
- if (de->file_type > OCFS2_FT_MAX) {
- mlog(ML_ERROR,
- "block %llu contains invalid de: "
- "inode = %llu, rec_len = %u, "
- "name_len = %u, file_type = %u, "
- "name='%.*s'\n",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(de->inode),
- le16_to_cpu(de->rec_len),
- de->name_len,
- de->file_type,
- de->name_len,
- de->name);
- continue;
- }
- if (de->name_len == 1 && !strncmp(".", de->name, 1))
- continue;
- if (de->name_len == 2 && !strncmp("..", de->name, 2))
- continue;
-
- iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
- OCFS2_FI_FLAG_NOLOCK);
- if (IS_ERR(iter))
- continue;
-
- mlog(0, "queue orphan %llu\n",
- (unsigned long long)OCFS2_I(iter)->ip_blkno);
- /* No locking is required for the next_orphan
- * queue as there is only ever a single
- * process doing orphan recovery. */
- OCFS2_I(iter)->ip_next_orphan = *head;
- *head = iter;
- }
- brelse(bh);
+ status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx);
+ if (status) {
+ mlog_errno(status);
+ goto out_cluster;
}
-out_unlock:
- ocfs2_meta_unlock(orphan_dir_inode, 0);
+ *head = priv.head;
+
+out_cluster:
+ ocfs2_inode_unlock(orphan_dir_inode, 0);
out:
mutex_unlock(&orphan_dir_inode->i_mutex);
iput(orphan_dir_inode);
@@ -1579,7 +2114,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
struct inode *iter;
struct ocfs2_inode_info *oi;
- mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
+ trace_ocfs2_recover_orphans(slot);
ocfs2_mark_recovering_orphan_dir(osb, slot);
ret = ocfs2_queue_orphans(osb, slot, &inode);
@@ -1592,21 +2127,15 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
while (inode) {
oi = OCFS2_I(inode);
- mlog(0, "iput orphan %llu\n", (unsigned long long)oi->ip_blkno);
+ trace_ocfs2_recover_orphans_iput(
+ (unsigned long long)oi->ip_blkno);
iter = oi->ip_next_orphan;
spin_lock(&oi->ip_lock);
- /* Delete voting may have set these on the assumption
- * that the other node would wipe them successfully.
- * If they are still in the node's orphan dir, we need
- * to reset that state. */
- oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
-
/* Set the proper information to get us going into
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
- oi->ip_orphaned_slot = slot;
spin_unlock(&oi->ip_lock);
iput(inode);
@@ -1617,19 +2146,21 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
return ret;
}
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
{
/* This check is good because ocfs2 will wait on our recovery
* thread before changing it to something other than MOUNTED
* or DISABLED. */
wait_event(osb->osb_mount_event,
- atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
+ (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) ||
+ atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
atomic_read(&osb->vol_state) == VOLUME_DISABLED);
/* If there's an error on mount, then we may never get to the
* MOUNTED flag, but this is set right before
* dismount_volume() so we can trust it. */
if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
+ trace_ocfs2_wait_on_mount(VOLUME_DISABLED);
mlog(0, "mount error, exiting!\n");
return -EBUSY;
}
@@ -1655,8 +2186,20 @@ static int ocfs2_commit_thread(void *arg)
|| kthread_should_stop());
status = ocfs2_commit_cache(osb);
- if (status < 0)
- mlog_errno(status);
+ if (status < 0) {
+ static unsigned long abort_warn_time;
+
+ /* Warn about this once per minute */
+ if (printk_timed_ratelimit(&abort_warn_time, 60*HZ))
+ mlog(ML_ERROR, "status = %d, journal is "
+ "already aborted.\n", status);
+ /*
+ * After ocfs2_commit_cache() fails, j_num_trans has a
+ * non-zero value. Sleep here to avoid a busy-wait
+ * loop.
+ */
+ msleep_interruptible(1000);
+ }
if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
mlog(ML_KTHREAD,
@@ -1669,49 +2212,41 @@ static int ocfs2_commit_thread(void *arg)
return 0;
}
-/* Look for a dirty journal without taking any cluster locks. Used for
- * hard readonly access to determine whether the file system journals
- * require recovery. */
+/* Reads all the journal inodes without taking any cluster locks. Used
+ * for hard readonly access to determine whether any journal requires
+ * recovery. Also used to refresh the recovery generation numbers after
+ * a journal has been recovered by another node.
+ */
int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
{
int ret = 0;
unsigned int slot;
- struct buffer_head *di_bh;
+ struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
- struct inode *journal = NULL;
+ int journal_dirty = 0;
for(slot = 0; slot < osb->max_slots; slot++) {
- journal = ocfs2_get_system_file_inode(osb,
- JOURNAL_SYSTEM_INODE,
- slot);
- if (!journal || is_bad_inode(journal)) {
- ret = -EACCES;
- mlog_errno(ret);
- goto out;
- }
-
- di_bh = NULL;
- ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
- 0, journal);
- if (ret < 0) {
+ ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL);
+ if (ret) {
mlog_errno(ret);
goto out;
}
di = (struct ocfs2_dinode *) di_bh->b_data;
+ osb->slot_recovery_generations[slot] =
+ ocfs2_get_recovery_generation(di);
+
if (le32_to_cpu(di->id1.journal1.ij_flags) &
OCFS2_JOURNAL_DIRTY_FL)
- ret = -EROFS;
+ journal_dirty = 1;
brelse(di_bh);
- if (ret)
- break;
+ di_bh = NULL;
}
out:
- if (journal)
- iput(journal);
-
+ if (journal_dirty)
+ ret = -EROFS;
return ret;
}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 2f3a6acdac4..7f8cde94abf 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,7 +27,7 @@
#define OCFS2_JOURNAL_H
#include <linux/fs.h>
-#include <linux/jbd.h>
+#include <linux/jbd2.h>
enum ocfs2_journal_state {
OCFS2_JOURNAL_FREE = 0,
@@ -37,7 +37,17 @@ enum ocfs2_journal_state {
struct ocfs2_super;
struct ocfs2_dinode;
-struct ocfs2_journal_handle;
+
+/*
+ * The recovery_list is a simple linked list of node numbers to recover.
+ * It is protected by the recovery_lock.
+ */
+
+struct ocfs2_recovery_map {
+ unsigned int rm_used;
+ unsigned int *rm_entries;
+};
+
struct ocfs2_journal {
enum ocfs2_journal_state j_state; /* Journals current state */
@@ -57,11 +67,12 @@ struct ocfs2_journal {
struct buffer_head *j_bh; /* Journal disk inode block */
atomic_t j_num_trans; /* Number of transactions
* currently in the system. */
+ spinlock_t j_lock;
unsigned long j_trans_id;
struct rw_semaphore j_trans_barrier;
wait_queue_head_t j_checkpointed;
- spinlock_t j_lock;
+ /* both fields protected by j_lock*/
struct list_head j_la_cleanups;
struct work_struct j_recovery_work;
};
@@ -80,100 +91,82 @@ static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
return old_id;
}
-static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
- struct inode *inode)
+static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal,
+ struct ocfs2_caching_info *ci)
{
spin_lock(&trans_inc_lock);
- OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
+ ci->ci_last_trans = journal->j_trans_id;
spin_unlock(&trans_inc_lock);
}
/* Used to figure out whether it's safe to drop a metadata lock on an
- * inode. Returns true if all the inodes changes have been
+ * cached object. Returns true if all the object's changes have been
* checkpointed to disk. You should be holding the spinlock on the
* metadata lock while calling this to be sure that nobody can take
* the lock and put it on another transaction. */
-static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
+static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci)
{
int ret;
- struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
+ struct ocfs2_journal *journal =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
spin_lock(&trans_inc_lock);
- ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
+ ret = time_after(journal->j_trans_id, ci->ci_last_trans);
spin_unlock(&trans_inc_lock);
return ret;
}
-/* convenience function to check if an inode is still new (has never
- * hit disk) Will do you a favor and set created_trans = 0 when you've
- * been checkpointed. returns '1' if the inode is still new. */
-static inline int ocfs2_inode_is_new(struct inode *inode)
+/* convenience function to check if an object backed by struct
+ * ocfs2_caching_info is still new (has never hit disk) Will do you a
+ * favor and set created_trans = 0 when you've
+ * been checkpointed. returns '1' if the ci is still new. */
+static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci)
{
int ret;
+ struct ocfs2_journal *journal =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
- /* System files are never "new" as they're written out by
- * mkfs. This helps us early during mount, before we have the
- * journal open and j_trans_id could be junk. */
- if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
- return 0;
spin_lock(&trans_inc_lock);
- ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
- OCFS2_I(inode)->ip_created_trans));
+ ret = !(time_after(journal->j_trans_id, ci->ci_created_trans));
if (!ret)
- OCFS2_I(inode)->ip_created_trans = 0;
+ ci->ci_created_trans = 0;
spin_unlock(&trans_inc_lock);
return ret;
}
-static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
- struct inode *inode)
+/* Wrapper for inodes so we can check system files */
+static inline int ocfs2_inode_is_new(struct inode *inode)
{
- spin_lock(&trans_inc_lock);
- OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
- spin_unlock(&trans_inc_lock);
-}
-
-extern kmem_cache_t *ocfs2_lock_cache;
-
-struct ocfs2_journal_lock {
- struct inode *jl_inode;
- struct list_head jl_lock_list;
-};
-
-struct ocfs2_journal_handle {
- handle_t *k_handle; /* kernel handle. */
- struct ocfs2_journal *journal;
- u32 flags; /* see flags below. */
- int max_buffs; /* Buffs reserved by this handle */
-
- /* The following two fields are for ocfs2_handle_add_lock */
- int num_locks;
- struct list_head locks; /* A bunch of locks to
- * release on commit. This
- * should be a list_head */
-
- struct list_head inode_list;
-};
+ /* System files are never "new" as they're written out by
+ * mkfs. This helps us early during mount, before we have the
+ * journal open and j_trans_id could be junk. */
+ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+ return 0;
-#define OCFS2_HANDLE_STARTED 1
-/* should we sync-commit this handle? */
-#define OCFS2_HANDLE_SYNC 2
-static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle)
-{
- return handle->flags & OCFS2_HANDLE_STARTED;
+ return ocfs2_ci_is_new(INODE_CACHE(inode));
}
-static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync)
+static inline void ocfs2_ci_set_new(struct ocfs2_super *osb,
+ struct ocfs2_caching_info *ci)
{
- if (sync)
- handle->flags |= OCFS2_HANDLE_SYNC;
- else
- handle->flags &= ~OCFS2_HANDLE_SYNC;
+ spin_lock(&trans_inc_lock);
+ ci->ci_created_trans = osb->journal->j_trans_id;
+ spin_unlock(&trans_inc_lock);
}
/* Exported only for the journal struct init code in super.c. Do not call. */
-void ocfs2_complete_recovery(void *data);
+void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
+
+void ocfs2_complete_recovery(struct work_struct *work);
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
+
+int ocfs2_recovery_init(struct ocfs2_super *osb);
+void ocfs2_recovery_exit(struct ocfs2_super *osb);
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
/*
* Journal Control:
* Initialize, Load, Shutdown, Wipe a journal.
@@ -196,16 +189,17 @@ int ocfs2_journal_init(struct ocfs2_journal *journal,
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
int ocfs2_journal_wipe(struct ocfs2_journal *journal,
int full);
-int ocfs2_journal_load(struct ocfs2_journal *journal);
+int ocfs2_journal_load(struct ocfs2_journal *journal, int local,
+ int replayed);
int ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
void ocfs2_recovery_thread(struct ocfs2_super *osb,
int node_num);
int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
{
- atomic_set(&osb->needs_checkpoint, 1);
wake_up(&osb->checkpoint_event);
}
@@ -213,17 +207,20 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- if (!ocfs2_inode_fully_checkpointed(inode)) {
+ if (ocfs2_mount_local(osb))
+ return;
+
+ if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) {
/* WARNING: This only kicks off a single
* checkpoint. If someone races you and adds more
* metadata to the journal, you won't know, and will
- * wind up waiting *alot* longer than necessary. Right
+ * wind up waiting *a lot* longer than necessary. Right
* now we only use this in clear_inode so that's
* OK. */
ocfs2_start_checkpoint(osb);
wait_event(osb->journal->j_checkpointed,
- ocfs2_inode_fully_checkpointed(inode));
+ ocfs2_ci_fully_checkpointed(INODE_CACHE(inode)));
}
}
@@ -231,42 +228,47 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
* Transaction Handling:
* Manage the lifetime of a transaction handle.
*
- * ocfs2_alloc_handle - Only allocate a handle so we can start putting
- * cluster locks on it. To actually change blocks,
- * call ocfs2_start_trans with the handle returned
- * from this function. You may call ocfs2_commit_trans
- * at any time in the lifetime of a handle.
* ocfs2_start_trans - Begin a transaction. Give it an upper estimate of
* the number of blocks that will be changed during
* this handle.
- * ocfs2_commit_trans - Complete a handle.
+ * ocfs2_commit_trans - Complete a handle. It might return -EIO if
+ * the journal was aborted. The majority of paths don't
+ * check the return value as an error there comes too
+ * late to do anything (and will be picked up in a
+ * later transaction).
* ocfs2_extend_trans - Extend a handle by nblocks credits. This may
* commit the handle to disk in the process, but will
* not release any locks taken during the transaction.
- * ocfs2_journal_access - Notify the handle that we want to journal this
+ * ocfs2_journal_access* - Notify the handle that we want to journal this
* buffer. Will have to call ocfs2_journal_dirty once
* we've actually dirtied it. Type is one of . or .
+ * Always call the specific flavor of
+ * ocfs2_journal_access_*() unless you intend to
+ * manage the checksum by hand.
* ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
- * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
- * the current handle commits.
- * ocfs2_handle_add_lock - Sometimes we need to delay lock release
- * until after a transaction has been completed. Use
- * ocfs2_handle_add_lock to indicate that a lock needs
- * to be released at the end of that handle. Locks
- * will be released in the order that they are added.
- * ocfs2_handle_add_inode - Add a locked inode to a transaction.
+ * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
+ * the current handle commits.
*/
/* You must always start_trans with a number of buffs > 0, but it's
* perfectly legal to go through an entire transaction without having
* dirtied any buffers. */
-struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb);
-struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+handle_t *ocfs2_start_trans(struct ocfs2_super *osb,
int max_buffs);
-void ocfs2_commit_trans(struct ocfs2_journal_handle *handle);
-int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
- int nblocks);
+int ocfs2_commit_trans(struct ocfs2_super *osb,
+ handle_t *handle);
+int ocfs2_extend_trans(handle_t *handle, int nblocks);
+int ocfs2_allocate_extend_trans(handle_t *handle,
+ int thresh);
+
+/*
+ * Define an arbitrary limit for the amount of data we will anticipate
+ * writing to any given transaction. For unbounded transactions such as
+ * fallocate(2) we can write more than this, but we always
+ * start off at the maximum transaction size and grow the transaction
+ * optimistically as we go.
+ */
+#define OCFS2_MAX_TRANS_DATA 64U
/*
* Create access is for when we get a newly created buffer and we're
@@ -283,10 +285,38 @@ int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
#define OCFS2_JOURNAL_ACCESS_WRITE 1
#define OCFS2_JOURNAL_ACCESS_UNDO 2
-int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
- struct inode *inode,
- struct buffer_head *bh,
- int type);
+
+/* ocfs2_inode */
+int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type);
+/* ocfs2_extent_block */
+int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type);
+/* ocfs2_refcount_block */
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type);
+/* ocfs2_group_desc */
+int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type);
+/* ocfs2_xattr_block */
+int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type);
+/* quota blocks */
+int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type);
+/* dirblock */
+int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type);
+/* ocfs2_dx_root_block */
+int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type);
+/* ocfs2_dx_leaf */
+int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type);
+/* Anything that has no ecc */
+int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type);
+
/*
* A word about the journal_access/journal_dirty "dance". It is
* entirely legal to journal_access a buffer more than once (as long
@@ -306,18 +336,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
* <modify the bh>
* ocfs2_journal_dirty(handle, bh);
*/
-int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
- struct buffer_head *bh);
-int ocfs2_journal_dirty_data(handle_t *handle,
- struct buffer_head *bh);
-int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
- struct inode *inode);
-/*
- * Use this to protect from other processes reading buffer state while
- * it's in flight.
- */
-void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
- struct inode *inode);
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
/*
* Credit Macros:
@@ -331,10 +350,58 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
/* simple file updates like chmod, etc. */
#define OCFS2_INODE_UPDATE_CREDITS 1
+/* extended attribute block update */
+#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+
+/* Update of a single quota block */
+#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1
+
+/* global quotafile inode update, data block */
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
+
+#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS
+/*
+ * The two writes below can accidentally see global info dirty due
+ * to set_info() quotactl so make them prepared for the writes.
+ */
+/* quota data block, global info */
+/* Write to local quota file */
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
+
+/* global quota data block, local quota data block, global quota inode,
+ * global quota info */
+#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+ 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
+
+static inline int ocfs2_quota_trans_credits(struct super_block *sb)
+{
+ int credits = 0;
+
+ if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
+ credits += OCFS2_QWRITE_CREDITS;
+ if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
+ credits += OCFS2_QWRITE_CREDITS;
+ return credits;
+}
+
+/* group extend. inode update and last group update. */
+#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* group add. inode update and the new group update. */
+#define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
/* get one bit out of a suballocator: dinode + group descriptor +
* prev. group desc. if we relink. */
#define OCFS2_SUBALLOC_ALLOC (3)
+static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
+{
+ return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
+ ocfs2_quota_trans_credits(sb);
+}
+
/* dinode + group descriptor update. We don't relink on free yet. */
#define OCFS2_SUBALLOC_FREE (2)
@@ -342,14 +409,38 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
+ OCFS2_TRUNCATE_LOG_UPDATE)
-/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
- * bitmap block for the new bit) */
-#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
+static inline int ocfs2_remove_extent_credits(struct super_block *sb)
+{
+ return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS +
+ ocfs2_quota_trans_credits(sb);
+}
+
+/* data block for new dir/symlink, allocation of directory block, dx_root
+ * update for free list */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1)
-/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
- * group descriptor + mkdir/symlink blocks */
-#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \
- + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
+static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
+{
+ /* 1 block for index, 2 allocs (data, metadata), 1 clusters
+ * worth of blocks for initial extent. */
+ return 1 + 2 * OCFS2_SUBALLOC_ALLOC +
+ ocfs2_clusters_to_blocks(sb, 1);
+}
+
+/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
+ * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
+ * blocks + quota update */
+static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
+ int xattr_credits)
+{
+ int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
+
+ if (is_dir)
+ dir_credits += ocfs2_add_dir_index_credits(sb);
+
+ return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
+ ocfs2_quota_trans_credits(sb);
+}
/* local alloc metadata change + main bitmap updates */
#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
@@ -359,29 +450,83 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
* for the dinode, one for the new block. */
#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
-/* file update (nlink, etc) + dir entry block */
-#define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
+ * update on dir + index leaf + dx root update for free list +
+ * previous dirblock update in the free list */
+static inline int ocfs2_link_credits(struct super_block *sb)
+{
+ return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
+ ocfs2_quota_trans_credits(sb);
+}
/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
- * dir inode link */
-#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \
- + OCFS2_LINK_CREDITS)
+ * dir inode link + dir inode index leaf + dir index root */
+static inline int ocfs2_unlink_credits(struct super_block *sb)
+{
+ /* The quota update from ocfs2_link_credits is unused here... */
+ return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb);
+}
/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
- * inode alloc group descriptor */
-#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
+ * inode alloc group descriptor + orphan dir index root +
+ * orphan dir index leaf */
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
/* dinode update, old dir dinode update, new dir dinode update, old
* dir dir entry, new dir dir entry, dir entry update for renaming
- * directory + target unlink */
-#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
- + OCFS2_UNLINK_CREDITS)
+ * directory + target unlink + 3 x dir index leaves */
+static inline int ocfs2_rename_credits(struct super_block *sb)
+{
+ return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
+}
+
+/* global bitmap dinode, group desc., relinked group,
+ * suballocator dinode, group desc., relinked group,
+ * dinode, xattr block */
+#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \
+ + OCFS2_INODE_UPDATE_CREDITS \
+ + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
+
+/* inode update, removal of dx root block from allocator */
+#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
+ OCFS2_SUBALLOC_FREE)
+
+static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
+{
+ int credits = 1 + OCFS2_SUBALLOC_ALLOC;
+
+ credits += ocfs2_clusters_to_blocks(sb, 1);
+ credits += ocfs2_quota_trans_credits(sb);
+
+ return credits;
+}
+
+/* inode update, new refcount block and its allocation credits. */
+#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \
+ + OCFS2_SUBALLOC_ALLOC)
+
+/* inode and the refcount block update. */
+#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/*
+ * inode and the refcount block update.
+ * It doesn't include the credits for sub alloc change.
+ * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added.
+ */
+#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+/* 2 metadata alloc, 2 new blocks and root refcount block */
+#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3)
+
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
static inline int ocfs2_calc_extend_credits(struct super_block *sb,
- struct ocfs2_dinode *fe,
- u32 bits_wanted)
+ struct ocfs2_extent_list *root_el)
{
- int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
+ int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
/* bitmap dinode, group desc. + relinked group. */
bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
@@ -392,27 +537,28 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
* however many metadata chunks needed * a remaining suballoc
* alloc. */
sysfile_bitmap_blocks = 1 +
- (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
+ (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el);
/* this does not include *new* metadata blocks, which are
- * accounted for in sysfile_bitmap_blocks. fe +
+ * accounted for in sysfile_bitmap_blocks. root_el +
* prev. last_eb_blk + blocks along edge of tree.
* calc_symlink_credits passes because we just need 1
* credit for the dinode there. */
- dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
+ extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
- return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
+ return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
+ ocfs2_quota_trans_credits(sb);
}
static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
{
- int blocks = OCFS2_MKNOD_CREDITS;
+ int blocks = ocfs2_mknod_credits(sb, 0, 0);
/* links can be longer than one block so we may update many
* within our single allocated extent. */
blocks += ocfs2_clusters_to_blocks(sb, 1);
- return blocks;
+ return blocks + ocfs2_quota_trans_credits(sb);
}
static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
@@ -426,6 +572,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
return blocks;
}
+/*
+ * Allocating a discontiguous block group requires the credits from
+ * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
+ * the group descriptor's extent list. The caller already has started
+ * the transaction with ocfs2_calc_group_alloc_credits(). They extend
+ * it with these credits.
+ */
+static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
+{
+ return ocfs2_extent_recs_per_gd(sb);
+}
+
static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
unsigned int clusters_to_del,
struct ocfs2_dinode *fe,
@@ -443,13 +601,40 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
/* We may be deleting metadata blocks, so metadata alloc dinode +
one desc. block for each possible delete. */
if (tree_depth && next_free == 1 &&
- le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
+ ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
credits += 1 + tree_depth;
/* update to the truncate log. */
credits += OCFS2_TRUNCATE_LOG_UPDATE;
+ credits += ocfs2_quota_trans_credits(sb);
+
return credits;
}
+static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+ return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+}
+
+static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
+{
+ return jbd2_journal_begin_ordered_truncate(
+ OCFS2_SB(inode->i_sb)->journal->j_journal,
+ &OCFS2_I(inode)->ip_jinode,
+ new_size);
+}
+
+static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
+ struct inode *inode,
+ int datasync)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ oi->i_sync_tid = handle->h_transaction->t_tid;
+ if (datasync)
+ oi->i_datasync_tid = handle->h_transaction->t_tid;
+}
+
#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 1f17a4d0828..04401345562 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -29,12 +29,12 @@
#include <linux/highmem.h>
#include <linux/bitops.h>
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
@@ -42,73 +42,243 @@
#include "suballoc.h"
#include "super.h"
#include "sysfile.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
-static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
-
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc,
- u32 numbits);
+ u32 *numbits,
+ struct ocfs2_alloc_reservation *resv);
static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+ handle_t *handle,
struct ocfs2_dinode *alloc,
struct inode *main_bm_inode,
struct buffer_head *main_bm_bh);
static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context **ac,
struct inode **bitmap_inode,
struct buffer_head **bitmap_bh);
static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+ handle_t *handle,
struct ocfs2_alloc_context *ac);
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct inode *local_alloc_inode);
/*
- * Determine how large our local alloc window should be, in bits.
+ * ocfs2_la_default_mb() - determine a default size, in megabytes of
+ * the local alloc.
+ *
+ * Generally, we'd like to pick as large a local alloc as
+ * possible. Performance on large workloads tends to scale
+ * proportionally to la size. In addition to that, the reservations
+ * code functions more efficiently as it can reserve more windows for
+ * write.
+ *
+ * Some things work against us when trying to choose a large local alloc:
+ *
+ * - We need to ensure our sizing is picked to leave enough space in
+ * group descriptors for other allocations (such as block groups,
+ * etc). Picking default sizes which are a multiple of 4 could help
+ * - block groups are allocated in 2mb and 4mb chunks.
+ *
+ * - Likewise, we don't want to starve other nodes of bits on small
+ * file systems. This can easily be taken care of by limiting our
+ * default to a reasonable size (256M) on larger cluster sizes.
+ *
+ * - Some file systems can't support very large sizes - 4k and 8k in
+ * particular are limited to less than 128 and 256 megabytes respectively.
*
- * These values (and the behavior in ocfs2_alloc_should_use_local) have
- * been chosen so that most allocations, including new block groups go
- * through local alloc.
+ * The following reference table shows group descriptor and local
+ * alloc maximums at various cluster sizes (4k blocksize)
+ *
+ * csize: 4K group: 126M la: 121M
+ * csize: 8K group: 252M la: 243M
+ * csize: 16K group: 504M la: 486M
+ * csize: 32K group: 1008M la: 972M
+ * csize: 64K group: 2016M la: 1944M
+ * csize: 128K group: 4032M la: 3888M
+ * csize: 256K group: 8064M la: 7776M
+ * csize: 512K group: 16128M la: 15552M
+ * csize: 1024K group: 32256M la: 31104M
*/
-static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
+#define OCFS2_LA_MAX_DEFAULT_MB 256
+#define OCFS2_LA_OLD_DEFAULT 8
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
+{
+ unsigned int la_mb;
+ unsigned int gd_mb;
+ unsigned int la_max_mb;
+ unsigned int megs_per_slot;
+ struct super_block *sb = osb->sb;
+
+ gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
+ 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
+
+ /*
+ * This takes care of files systems with very small group
+ * descriptors - 512 byte blocksize at cluster sizes lower
+ * than 16K and also 1k blocksize with 4k cluster size.
+ */
+ if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
+ || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
+ return OCFS2_LA_OLD_DEFAULT;
+
+ /*
+ * Leave enough room for some block groups and make the final
+ * value we work from a multiple of 4.
+ */
+ gd_mb -= 16;
+ gd_mb &= 0xFFFFFFFB;
+
+ la_mb = gd_mb;
+
+ /*
+ * Keep window sizes down to a reasonable default
+ */
+ if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
+ /*
+ * Some clustersize / blocksize combinations will have
+ * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
+ * default size, but get poor distribution when
+ * limited to exactly 256 megabytes.
+ *
+ * As an example, 16K clustersize at 4K blocksize
+ * gives us a cluster group size of 504M. Paring the
+ * local alloc size down to 256 however, would give us
+ * only one window and around 200MB left in the
+ * cluster group. Instead, find the first size below
+ * 256 which would give us an even distribution.
+ *
+ * Larger cluster group sizes actually work out pretty
+ * well when pared to 256, so we don't have to do this
+ * for any group that fits more than two
+ * OCFS2_LA_MAX_DEFAULT_MB windows.
+ */
+ if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
+ la_mb = 256;
+ else {
+ unsigned int gd_mult = gd_mb;
+
+ while (gd_mult > 256)
+ gd_mult = gd_mult >> 1;
+
+ la_mb = gd_mult;
+ }
+ }
+
+ megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
+ megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
+ /* Too many nodes, too few disk clusters. */
+ if (megs_per_slot < la_mb)
+ la_mb = megs_per_slot;
+
+ /* We can't store more bits than we can in a block. */
+ la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
+ ocfs2_local_alloc_size(sb) * 8);
+ if (la_mb > la_max_mb)
+ la_mb = la_max_mb;
+
+ return la_mb;
+}
+
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
+{
+ struct super_block *sb = osb->sb;
+ unsigned int la_default_mb = ocfs2_la_default_mb(osb);
+ unsigned int la_max_mb;
+
+ la_max_mb = ocfs2_clusters_to_megabytes(sb,
+ ocfs2_local_alloc_size(sb) * 8);
+
+ trace_ocfs2_la_set_sizes(requested_mb, la_max_mb, la_default_mb);
+
+ if (requested_mb == -1) {
+ /* No user request - use defaults */
+ osb->local_alloc_default_bits =
+ ocfs2_megabytes_to_clusters(sb, la_default_mb);
+ } else if (requested_mb > la_max_mb) {
+ /* Request is too big, we give the maximum available */
+ osb->local_alloc_default_bits =
+ ocfs2_megabytes_to_clusters(sb, la_max_mb);
+ } else {
+ osb->local_alloc_default_bits =
+ ocfs2_megabytes_to_clusters(sb, requested_mb);
+ }
+
+ osb->local_alloc_bits = osb->local_alloc_default_bits;
+}
+
+static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
+{
+ return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
+ osb->local_alloc_state == OCFS2_LA_ENABLED);
+}
+
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+ unsigned int num_clusters)
{
- BUG_ON(osb->s_clustersize_bits < 12);
+ spin_lock(&osb->osb_lock);
+ if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
+ osb->local_alloc_state == OCFS2_LA_THROTTLED)
+ if (num_clusters >= osb->local_alloc_default_bits) {
+ cancel_delayed_work(&osb->la_enable_wq);
+ osb->local_alloc_state = OCFS2_LA_ENABLED;
+ }
+ spin_unlock(&osb->osb_lock);
+}
- return 2048 >> (osb->s_clustersize_bits - 12);
+void ocfs2_la_enable_worker(struct work_struct *work)
+{
+ struct ocfs2_super *osb =
+ container_of(work, struct ocfs2_super,
+ la_enable_wq.work);
+ spin_lock(&osb->osb_lock);
+ osb->local_alloc_state = OCFS2_LA_ENABLED;
+ spin_unlock(&osb->osb_lock);
}
/*
* Tell us whether a given allocation should use the local alloc
* file. Otherwise, it has to go to the main bitmap.
+ *
+ * This function does semi-dirty reads of local alloc size and state!
+ * This is ok however, as the values are re-checked once under mutex.
*/
int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
{
- int la_bits = ocfs2_local_alloc_window_bits(osb);
+ int ret = 0;
+ int la_bits;
+
+ spin_lock(&osb->osb_lock);
+ la_bits = osb->local_alloc_bits;
- if (osb->local_alloc_state != OCFS2_LA_ENABLED)
- return 0;
+ if (!ocfs2_la_state_enabled(osb))
+ goto bail;
/* la_bits should be at least twice the size (in clusters) of
* a new block group. We want to be sure block group
* allocations go through the local alloc, so allow an
* allocation to take up to half the bitmap. */
if (bits > (la_bits / 2))
- return 0;
+ goto bail;
- return 1;
+ ret = 1;
+bail:
+ trace_ocfs2_alloc_should_use_local(
+ (unsigned long long)bits, osb->local_alloc_state, la_bits, ret);
+ spin_unlock(&osb->osb_lock);
+ return ret;
}
int ocfs2_load_local_alloc(struct ocfs2_super *osb)
@@ -120,7 +290,17 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
struct inode *inode = NULL;
struct ocfs2_local_alloc *la;
- mlog_entry_void();
+ if (osb->local_alloc_bits == 0)
+ goto bail;
+
+ if (osb->local_alloc_bits >= osb->bitmap_cpg) {
+ mlog(ML_NOTICE, "Requested local alloc window %d is larger "
+ "than max possible %u. Using defaults.\n",
+ osb->local_alloc_bits, (osb->bitmap_cpg - 1));
+ osb->local_alloc_bits =
+ ocfs2_megabytes_to_clusters(osb->sb,
+ ocfs2_la_default_mb(osb));
+ }
/* read the alloc off disk */
inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
@@ -131,8 +311,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
goto bail;
}
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
- &alloc_bh, 0, inode);
+ status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+ OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -177,12 +357,14 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
bail:
if (status < 0)
- if (alloc_bh)
- brelse(alloc_bh);
+ brelse(alloc_bh);
if (inode)
iput(inode);
- mlog_exit(status);
+ trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
+
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -196,7 +378,7 @@ bail:
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
{
int status;
- struct ocfs2_journal_handle *handle = NULL;
+ handle_t *handle;
struct inode *local_alloc_inode = NULL;
struct buffer_head *bh = NULL;
struct buffer_head *main_bm_bh = NULL;
@@ -204,10 +386,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
struct ocfs2_dinode *alloc_copy = NULL;
struct ocfs2_dinode *alloc = NULL;
- mlog_entry_void();
+ cancel_delayed_work(&osb->la_enable_wq);
+ flush_workqueue(ocfs2_wq);
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
- goto bail;
+ goto out;
local_alloc_inode =
ocfs2_get_system_file_inode(osb,
@@ -216,17 +399,12 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
if (!local_alloc_inode) {
status = -ENOENT;
mlog_errno(status);
- goto bail;
+ goto out;
}
osb->local_alloc_state = OCFS2_LA_DISABLED;
- handle = ocfs2_alloc_handle(osb);
- if (!handle) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_resmap_uninit(&osb->osb_la_resmap);
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
@@ -234,48 +412,44 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
if (!main_bm_inode) {
status = -EINVAL;
mlog_errno(status);
- goto bail;
+ goto out;
}
- ocfs2_handle_add_inode(handle, main_bm_inode);
- status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
+ mutex_lock(&main_bm_inode->i_mutex);
+
+ status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto out_mutex;
}
/* WINDOW_MOVE_CREDITS is a bit heavy... */
- handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+ handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
if (IS_ERR(handle)) {
mlog_errno(PTR_ERR(handle));
handle = NULL;
- goto bail;
+ goto out_unlock;
}
bh = osb->local_alloc_bh;
alloc = (struct ocfs2_dinode *) bh->b_data;
- alloc_copy = kmalloc(bh->b_size, GFP_KERNEL);
+ alloc_copy = kmalloc(bh->b_size, GFP_NOFS);
if (!alloc_copy) {
status = -ENOMEM;
- goto bail;
+ goto out_commit;
}
memcpy(alloc_copy, alloc, bh->b_size);
- status = ocfs2_journal_access(handle, local_alloc_inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode),
+ bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto out_commit;
}
ocfs2_clear_local_alloc(alloc);
-
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, bh);
brelse(bh);
osb->local_alloc_bh = NULL;
@@ -286,23 +460,23 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
if (status < 0)
mlog_errno(status);
-bail:
- if (handle)
- ocfs2_commit_trans(handle);
+out_commit:
+ ocfs2_commit_trans(osb, handle);
- if (main_bm_bh)
- brelse(main_bm_bh);
+out_unlock:
+ brelse(main_bm_bh);
- if (main_bm_inode)
- iput(main_bm_inode);
+ ocfs2_inode_unlock(main_bm_inode, 1);
+
+out_mutex:
+ mutex_unlock(&main_bm_inode->i_mutex);
+ iput(main_bm_inode);
+out:
if (local_alloc_inode)
iput(local_alloc_inode);
- if (alloc_copy)
- kfree(alloc_copy);
-
- mlog_exit_void();
+ kfree(alloc_copy);
}
/*
@@ -321,7 +495,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
struct inode *inode = NULL;
struct ocfs2_dinode *alloc;
- mlog_entry("(slot_num = %d)\n", slot_num);
+ trace_ocfs2_begin_local_alloc_recovery(slot_num);
*alloc_copy = NULL;
@@ -336,8 +510,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
mutex_lock(&inode->i_mutex);
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
- &alloc_bh, 0, inode);
+ status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+ OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -353,25 +527,26 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
ocfs2_clear_local_alloc(alloc);
- status = ocfs2_write_block(osb, alloc_bh, inode);
+ ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
+ status = ocfs2_write_block(osb, alloc_bh, INODE_CACHE(inode));
if (status < 0)
mlog_errno(status);
bail:
- if ((status < 0) && (*alloc_copy)) {
+ if (status < 0) {
kfree(*alloc_copy);
*alloc_copy = NULL;
}
- if (alloc_bh)
- brelse(alloc_bh);
+ brelse(alloc_bh);
if (inode) {
mutex_unlock(&inode->i_mutex);
iput(inode);
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -385,18 +560,9 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc)
{
int status;
- struct ocfs2_journal_handle *handle = NULL;
+ handle_t *handle;
struct buffer_head *main_bm_bh = NULL;
- struct inode *main_bm_inode = NULL;
-
- mlog_entry_void();
-
- handle = ocfs2_alloc_handle(osb);
- if (!handle) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
+ struct inode *main_bm_inode;
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
@@ -404,55 +570,61 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
if (!main_bm_inode) {
status = -EINVAL;
mlog_errno(status);
- goto bail;
+ goto out;
}
- ocfs2_handle_add_inode(handle, main_bm_inode);
- status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
+ mutex_lock(&main_bm_inode->i_mutex);
+
+ status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto out_mutex;
}
- handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+ handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
mlog_errno(status);
- goto bail;
+ goto out_unlock;
}
/* we want the bitmap change to be recorded on disk asap */
- ocfs2_handle_set_sync(handle, 1);
+ handle->h_sync = 1;
status = ocfs2_sync_local_to_main(osb, handle, alloc,
main_bm_inode, main_bm_bh);
if (status < 0)
mlog_errno(status);
-bail:
- if (handle)
- ocfs2_commit_trans(handle);
+ ocfs2_commit_trans(osb, handle);
- if (main_bm_bh)
- brelse(main_bm_bh);
+out_unlock:
+ ocfs2_inode_unlock(main_bm_inode, 1);
- if (main_bm_inode)
- iput(main_bm_inode);
+out_mutex:
+ mutex_unlock(&main_bm_inode->i_mutex);
+
+ brelse(main_bm_bh);
- mlog_exit(status);
+ iput(main_bm_inode);
+
+out:
+ if (!status)
+ ocfs2_init_steal_slots(osb);
+ if (status)
+ mlog_errno(status);
return status;
}
/*
- * make sure we've got at least bitswanted contiguous bits in the
+ * make sure we've got at least bits_wanted contiguous bits in the
* local alloc. You lose them when you drop i_mutex.
*
* We will add ourselves to the transaction passed in, but may start
* our own in order to shift windows.
*/
int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *passed_handle,
u32 bits_wanted,
struct ocfs2_alloc_context *ac)
{
@@ -461,11 +633,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
struct inode *local_alloc_inode;
unsigned int free_bits;
- mlog_entry_void();
-
- BUG_ON(!passed_handle);
BUG_ON(!ac);
- BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED);
local_alloc_inode =
ocfs2_get_system_file_inode(osb,
@@ -476,21 +644,25 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
- ocfs2_handle_add_inode(passed_handle, local_alloc_inode);
- if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
- status = -ENOSPC;
- goto bail;
- }
+ mutex_lock(&local_alloc_inode->i_mutex);
- if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
- mlog(0, "Asking for more than my max window size!\n");
+ /*
+ * We must double check state and allocator bits because
+ * another process may have changed them while holding i_mutex.
+ */
+ spin_lock(&osb->osb_lock);
+ if (!ocfs2_la_state_enabled(osb) ||
+ (bits_wanted > osb->local_alloc_bits)) {
+ spin_unlock(&osb->osb_lock);
status = -ENOSPC;
goto bail;
}
+ spin_unlock(&osb->osb_lock);
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+#ifdef CONFIG_OCFS2_DEBUG_FS
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
ocfs2_error(osb->sb, "local alloc inode %llu says it has "
@@ -501,6 +673,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
status = -EIO;
goto bail;
}
+#endif
free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
le32_to_cpu(alloc->id1.bitmap1.i_used);
@@ -513,44 +686,66 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
+
+ /*
+ * Under certain conditions, the window slide code
+ * might have reduced the number of bits available or
+ * disabled the the local alloc entirely. Re-check
+ * here and return -ENOSPC if necessary.
+ */
+ status = -ENOSPC;
+ if (!ocfs2_la_state_enabled(osb))
+ goto bail;
+
+ free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
+ le32_to_cpu(alloc->id1.bitmap1.i_used);
+ if (bits_wanted > free_bits)
+ goto bail;
}
- ac->ac_inode = igrab(local_alloc_inode);
+ ac->ac_inode = local_alloc_inode;
+ /* We should never use localalloc from another slot */
+ ac->ac_alloc_slot = osb->slot_num;
+ ac->ac_which = OCFS2_AC_USE_LOCAL;
get_bh(osb->local_alloc_bh);
ac->ac_bh = osb->local_alloc_bh;
- ac->ac_which = OCFS2_AC_USE_LOCAL;
status = 0;
bail:
- if (local_alloc_inode)
+ if (status < 0 && local_alloc_inode) {
+ mutex_unlock(&local_alloc_inode->i_mutex);
iput(local_alloc_inode);
+ }
+
+ trace_ocfs2_reserve_local_alloc_bits(
+ (unsigned long long)ac->ac_max_block,
+ bits_wanted, osb->slot_num, status);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+ handle_t *handle,
struct ocfs2_alloc_context *ac,
- u32 min_bits,
+ u32 bits_wanted,
u32 *bit_off,
u32 *num_bits)
{
int status, start;
struct inode *local_alloc_inode;
- u32 bits_wanted;
void *bitmap;
struct ocfs2_dinode *alloc;
struct ocfs2_local_alloc *la;
- mlog_entry_void();
BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
- bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
local_alloc_inode = ac->ac_inode;
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
la = OCFS2_LOCAL_ALLOC(alloc);
- start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+ start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
+ ac->ac_resv);
if (start == -1) {
/* TODO: Shouldn't we just BUG here? */
status = -ENOSPC;
@@ -560,68 +755,125 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
bitmap = la->la_bitmap;
*bit_off = le32_to_cpu(la->la_bm_off) + start;
- /* local alloc is always contiguous by nature -- we never
- * delete bits from it! */
*num_bits = bits_wanted;
- status = ocfs2_journal_access(handle, local_alloc_inode,
- osb->local_alloc_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(local_alloc_inode),
+ osb->local_alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+ ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
+ bits_wanted);
+
while(bits_wanted--)
ocfs2_set_bit(start++, bitmap);
- alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits +
- le32_to_cpu(alloc->id1.bitmap1.i_used));
+ le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
- status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+bail:
+ if (status)
+ mlog_errno(status);
+ return status;
+}
+
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 bit_off,
+ u32 num_bits)
+{
+ int status, start;
+ u32 clear_bits;
+ struct inode *local_alloc_inode;
+ void *bitmap;
+ struct ocfs2_dinode *alloc;
+ struct ocfs2_local_alloc *la;
+
+ BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+ local_alloc_inode = ac->ac_inode;
+ alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+ la = OCFS2_LOCAL_ALLOC(alloc);
+
+ bitmap = la->la_bitmap;
+ start = bit_off - le32_to_cpu(la->la_bm_off);
+ clear_bits = num_bits;
+
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(local_alloc_inode),
+ osb->local_alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = 0;
+ while (clear_bits--)
+ ocfs2_clear_bit(start++, bitmap);
+
+ le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits);
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
bail:
- mlog_exit(status);
return status;
}
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
{
- int i;
- u8 *buffer;
- u32 count = 0;
+ u32 count;
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
- mlog_entry_void();
+ count = memweight(la->la_bitmap, le16_to_cpu(la->la_size));
- buffer = la->la_bitmap;
- for (i = 0; i < le16_to_cpu(la->la_size); i++)
- count += hweight8(buffer[i]);
-
- mlog_exit(count);
+ trace_ocfs2_local_alloc_count_bits(count);
return count;
}
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
- struct ocfs2_dinode *alloc,
- u32 numbits)
+ struct ocfs2_dinode *alloc,
+ u32 *numbits,
+ struct ocfs2_alloc_reservation *resv)
{
int numfound, bitoff, left, startoff, lastzero;
+ int local_resv = 0;
+ struct ocfs2_alloc_reservation r;
void *bitmap = NULL;
-
- mlog_entry("(numbits wanted = %u)\n", numbits);
+ struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
if (!alloc->id1.bitmap1.i_total) {
- mlog(0, "No bits in my window!\n");
bitoff = -1;
goto bail;
}
+ if (!resv) {
+ local_resv = 1;
+ ocfs2_resv_init_once(&r);
+ ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
+ resv = &r;
+ }
+
+ numfound = *numbits;
+ if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
+ if (numfound < *numbits)
+ *numbits = numfound;
+ goto bail;
+ }
+
+ /*
+ * Code error. While reservations are enabled, local
+ * allocation should _always_ go through them.
+ */
+ BUG_ON(osb->osb_resv_level != 0);
+
+ /*
+ * Reservations are disabled. Handle this the old way.
+ */
+
bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
numfound = bitoff = startoff = 0;
@@ -647,22 +899,27 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
startoff = bitoff+1;
}
/* we got everything we needed */
- if (numfound == numbits) {
+ if (numfound == *numbits) {
/* mlog(0, "Found it all!\n"); */
break;
}
}
- mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
- numfound);
+ trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound);
- if (numfound == numbits)
+ if (numfound == *numbits)
bitoff = startoff - numfound;
else
bitoff = -1;
bail:
- mlog_exit(bitoff);
+ if (local_resv)
+ ocfs2_resv_discard(resmap, resv);
+
+ trace_ocfs2_local_alloc_find_clear_bits(*numbits,
+ le32_to_cpu(alloc->id1.bitmap1.i_total),
+ bitoff, numfound);
+
return bitoff;
}
@@ -670,15 +927,12 @@ static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
{
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
int i;
- mlog_entry_void();
alloc->id1.bitmap1.i_total = 0;
alloc->id1.bitmap1.i_used = 0;
la->la_bm_off = 0;
for(i = 0; i < le16_to_cpu(la->la_size); i++)
la->la_bitmap[i] = 0;
-
- mlog_exit_void();
}
#if 0
@@ -707,7 +961,7 @@ static void ocfs2_verify_zero_bits(unsigned long *bitmap,
* passed is used for caching.
*/
static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+ handle_t *handle,
struct ocfs2_dinode *alloc,
struct inode *main_bm_inode,
struct buffer_head *main_bm_bh)
@@ -719,19 +973,16 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
void *bitmap;
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
- mlog_entry("total = %u, COUNT = %u, used = %u\n",
- le32_to_cpu(alloc->id1.bitmap1.i_total),
- ocfs2_local_alloc_count_bits(alloc),
- le32_to_cpu(alloc->id1.bitmap1.i_used));
+ trace_ocfs2_sync_local_to_main(
+ le32_to_cpu(alloc->id1.bitmap1.i_total),
+ le32_to_cpu(alloc->id1.bitmap1.i_used));
if (!alloc->id1.bitmap1.i_total) {
- mlog(0, "nothing to sync!\n");
goto bail;
}
if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
le32_to_cpu(alloc->id1.bitmap1.i_total)) {
- mlog(0, "all bits were taken!\n");
goto bail;
}
@@ -753,14 +1004,15 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
ocfs2_clusters_to_blocks(osb->sb,
start - count);
- mlog(0, "freeing %u bits starting at local alloc bit "
- "%u (la_start_blk = %llu, blkno = %llu)\n",
+ trace_ocfs2_sync_local_to_main_free(
count, start - count,
(unsigned long long)la_start_blk,
(unsigned long long)blkno);
- status = ocfs2_free_clusters(handle, main_bm_inode,
- main_bm_bh, blkno, count);
+ status = ocfs2_release_clusters(handle,
+ main_bm_inode,
+ main_bm_bh, blkno,
+ count);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -773,32 +1025,118 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
}
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
+enum ocfs2_la_event {
+ OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */
+ OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has
+ * enough bits theoretically
+ * free, but a contiguous
+ * allocation could not be
+ * found. */
+ OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have
+ * enough bits free to satisfy
+ * our request. */
+};
+#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ)
+/*
+ * Given an event, calculate the size of our next local alloc window.
+ *
+ * This should always be called under i_mutex of the local alloc inode
+ * so that local alloc disabling doesn't race with processes trying to
+ * use the allocator.
+ *
+ * Returns the state which the local alloc was left in. This value can
+ * be ignored by some paths.
+ */
+static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
+ enum ocfs2_la_event event)
+{
+ unsigned int bits;
+ int state;
+
+ spin_lock(&osb->osb_lock);
+ if (osb->local_alloc_state == OCFS2_LA_DISABLED) {
+ WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED);
+ goto out_unlock;
+ }
+
+ /*
+ * ENOSPC and fragmentation are treated similarly for now.
+ */
+ if (event == OCFS2_LA_EVENT_ENOSPC ||
+ event == OCFS2_LA_EVENT_FRAGMENTED) {
+ /*
+ * We ran out of contiguous space in the primary
+ * bitmap. Drastically reduce the number of bits used
+ * by local alloc until we have to disable it.
+ */
+ bits = osb->local_alloc_bits >> 1;
+ if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) {
+ /*
+ * By setting state to THROTTLED, we'll keep
+ * the number of local alloc bits used down
+ * until an event occurs which would give us
+ * reason to assume the bitmap situation might
+ * have changed.
+ */
+ osb->local_alloc_state = OCFS2_LA_THROTTLED;
+ osb->local_alloc_bits = bits;
+ } else {
+ osb->local_alloc_state = OCFS2_LA_DISABLED;
+ }
+ queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+ OCFS2_LA_ENABLE_INTERVAL);
+ goto out_unlock;
+ }
+
+ /*
+ * Don't increase the size of the local alloc window until we
+ * know we might be able to fulfill the request. Otherwise, we
+ * risk bouncing around the global bitmap during periods of
+ * low space.
+ */
+ if (osb->local_alloc_state != OCFS2_LA_THROTTLED)
+ osb->local_alloc_bits = osb->local_alloc_default_bits;
+
+out_unlock:
+ state = osb->local_alloc_state;
+ spin_unlock(&osb->osb_lock);
+
+ return state;
+}
+
static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context **ac,
struct inode **bitmap_inode,
struct buffer_head **bitmap_bh)
{
int status;
- *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+ *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
if (!(*ac)) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
- (*ac)->ac_handle = handle;
- (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
-
+retry_enospc:
+ (*ac)->ac_bits_wanted = osb->local_alloc_bits;
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+ if (status == -ENOSPC) {
+ if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
+ OCFS2_LA_DISABLED)
+ goto bail;
+
+ ocfs2_free_ac_resource(*ac);
+ memset(*ac, 0, sizeof(struct ocfs2_alloc_context));
+ goto retry_enospc;
+ }
if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
+ mlog_errno(status);
goto bail;
}
@@ -813,7 +1151,8 @@ bail:
*ac = NULL;
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -821,7 +1160,7 @@ bail:
* pass it the bitmap lock in lock_bh if you have it.
*/
static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+ handle_t *handle,
struct ocfs2_alloc_context *ac)
{
int status = 0;
@@ -829,17 +1168,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc = NULL;
struct ocfs2_local_alloc *la;
- mlog_entry_void();
-
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
la = OCFS2_LOCAL_ALLOC(alloc);
- if (alloc->id1.bitmap1.i_total)
- mlog(0, "asking me to alloc a new window over a non-empty "
- "one\n");
-
- mlog(0, "Allocating %u clusters for a new window.\n",
- ocfs2_local_alloc_window_bits(osb));
+ trace_ocfs2_local_alloc_new_window(
+ le32_to_cpu(alloc->id1.bitmap1.i_total),
+ osb->local_alloc_bits);
/* Instruct the allocation code to try the most recently used
* cluster group. We'll re-record the group used this pass
@@ -849,9 +1183,37 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
/* we used the generic suballoc reserve function, but we set
* everything up nicely, so there's no reason why we can't use
* the more specific cluster api to claim bits. */
- status = ocfs2_claim_clusters(osb, handle, ac,
- ocfs2_local_alloc_window_bits(osb),
+ status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
&cluster_off, &cluster_count);
+ if (status == -ENOSPC) {
+retry_enospc:
+ /*
+ * Note: We could also try syncing the journal here to
+ * allow use of any free bits which the current
+ * transaction can't give us access to. --Mark
+ */
+ if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
+ OCFS2_LA_DISABLED)
+ goto bail;
+
+ ac->ac_bits_wanted = osb->local_alloc_bits;
+ status = ocfs2_claim_clusters(handle, ac,
+ osb->local_alloc_bits,
+ &cluster_off,
+ &cluster_count);
+ if (status == -ENOSPC)
+ goto retry_enospc;
+ /*
+ * We only shrunk the *minimum* number of in our
+ * request - it's entirely possible that the allocator
+ * might give us more than we asked for.
+ */
+ if (status == 0) {
+ spin_lock(&osb->osb_lock);
+ osb->local_alloc_bits = cluster_count;
+ spin_unlock(&osb->osb_lock);
+ }
+ }
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -870,13 +1232,16 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
le16_to_cpu(la->la_size));
- mlog(0, "New window allocated:\n");
- mlog(0, "window la_bm_off = %u\n",
- OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
- mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
+ ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
+ OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
+
+ trace_ocfs2_local_alloc_new_window_result(
+ OCFS2_LOCAL_ALLOC(alloc)->la_bm_off,
+ le32_to_cpu(alloc->id1.bitmap1.i_total));
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -888,23 +1253,15 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
int status = 0;
struct buffer_head *main_bm_bh = NULL;
struct inode *main_bm_inode = NULL;
- struct ocfs2_journal_handle *handle = NULL;
+ handle_t *handle = NULL;
struct ocfs2_dinode *alloc;
struct ocfs2_dinode *alloc_copy = NULL;
struct ocfs2_alloc_context *ac = NULL;
- mlog_entry_void();
-
- handle = ocfs2_alloc_handle(osb);
- if (!handle) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
/* This will lock the main bitmap for us. */
status = ocfs2_local_alloc_reserve_for_window(osb,
- handle,
&ac,
&main_bm_inode,
&main_bm_bh);
@@ -914,7 +1271,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
goto bail;
}
- handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
+ handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -929,7 +1286,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
* local alloc shutdown won't try to double free main bitmap
* bits. Make a copy so the sync function knows which bits to
* free. */
- alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL);
+ alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_NOFS);
if (!alloc_copy) {
status = -ENOMEM;
mlog_errno(status);
@@ -937,21 +1294,17 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
}
memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
- status = ocfs2_journal_access(handle, local_alloc_inode,
- osb->local_alloc_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(local_alloc_inode),
+ osb->local_alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
ocfs2_clear_local_alloc(alloc);
-
- status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
main_bm_inode, main_bm_bh);
@@ -969,24 +1322,22 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
atomic_inc(&osb->alloc_stats.moves);
- status = 0;
bail:
if (handle)
- ocfs2_commit_trans(handle);
+ ocfs2_commit_trans(osb, handle);
- if (main_bm_bh)
- brelse(main_bm_bh);
+ brelse(main_bm_bh);
if (main_bm_inode)
iput(main_bm_inode);
- if (alloc_copy)
- kfree(alloc_copy);
+ kfree(alloc_copy);
if (ac)
ocfs2_free_alloc_context(ac);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 30f88ce14e4..44a7d1fb2de 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
+
int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
int node_num,
struct ocfs2_dinode **alloc_copy);
@@ -42,15 +45,24 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb,
struct ocfs2_alloc_context;
int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *passed_handle,
u32 bits_wanted,
struct ocfs2_alloc_context *ac);
int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+ handle_t *handle,
struct ocfs2_alloc_context *ac,
- u32 min_bits,
+ u32 bits_wanted,
u32 *bit_off,
u32 *num_bits);
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 bit_off,
+ u32 num_bits);
+
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+ unsigned int num_clusters);
+void ocfs2_la_enable_worker(struct work_struct *work);
+
#endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
new file mode 100644
index 00000000000..6b6d092b099
--- /dev/null
+++ b/fs/ocfs2/locks.c
@@ -0,0 +1,141 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * locks.c
+ *
+ * Userspace file locking support
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/fcntl.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "locks.h"
+
+static int ocfs2_do_flock(struct file *file, struct inode *inode,
+ int cmd, struct file_lock *fl)
+{
+ int ret = 0, level = 0, trylock = 0;
+ struct ocfs2_file_private *fp = file->private_data;
+ struct ocfs2_lock_res *lockres = &fp->fp_flock;
+
+ if (fl->fl_type == F_WRLCK)
+ level = 1;
+ if (!IS_SETLKW(cmd))
+ trylock = 1;
+
+ mutex_lock(&fp->fp_mutex);
+
+ if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
+ lockres->l_level > LKM_NLMODE) {
+ int old_level = 0;
+
+ if (lockres->l_level == LKM_EXMODE)
+ old_level = 1;
+
+ if (level == old_level)
+ goto out;
+
+ /*
+ * Converting an existing lock is not guaranteed to be
+ * atomic, so we can get away with simply unlocking
+ * here and allowing the lock code to try at the new
+ * level.
+ */
+
+ flock_lock_file_wait(file,
+ &(struct file_lock){.fl_type = F_UNLCK});
+
+ ocfs2_file_unlock(file);
+ }
+
+ ret = ocfs2_file_lock(file, level, trylock);
+ if (ret) {
+ if (ret == -EAGAIN && trylock)
+ ret = -EWOULDBLOCK;
+ else
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = flock_lock_file_wait(file, fl);
+ if (ret)
+ ocfs2_file_unlock(file);
+
+out:
+ mutex_unlock(&fp->fp_mutex);
+
+ return ret;
+}
+
+static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
+{
+ int ret;
+ struct ocfs2_file_private *fp = file->private_data;
+
+ mutex_lock(&fp->fp_mutex);
+ ocfs2_file_unlock(file);
+ ret = flock_lock_file_wait(file, fl);
+ mutex_unlock(&fp->fp_mutex);
+
+ return ret;
+}
+
+/*
+ * Overall flow of ocfs2_flock() was influenced by gfs2_flock().
+ */
+int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (!(fl->fl_flags & FL_FLOCK))
+ return -ENOLCK;
+ if (__mandatory_lock(inode))
+ return -ENOLCK;
+
+ if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
+ ocfs2_mount_local(osb))
+ return flock_lock_file_wait(file, fl);
+
+ if (fl->fl_type == F_UNLCK)
+ return ocfs2_do_funlock(file, cmd, fl);
+ else
+ return ocfs2_do_flock(file, inode, cmd, fl);
+}
+
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (!(fl->fl_flags & FL_POSIX))
+ return -ENOLCK;
+ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+ return -ENOLCK;
+
+ return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
+}
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/locks.h
index d7395cb91d2..496d488b271 100644
--- a/fs/ocfs2/ver.h
+++ b/fs/ocfs2/locks.h
@@ -1,9 +1,9 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
- * ver.h
+ * locks.h
*
- * Function prototypes
+ * Function prototypes for Userspace file locking support
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
@@ -23,9 +23,10 @@
* Boston, MA 021110-1307, USA.
*/
-#ifndef OCFS2_VER_H
-#define OCFS2_VER_H
+#ifndef OCFS2_LOCKS_H
+#define OCFS2_LOCKS_H
-void ocfs2_print_version(void);
+int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl);
-#endif /* OCFS2_VER_H */
+#endif /* OCFS2_LOCKS_H */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 83934e33e5b..10d66c75cec 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,73 +25,169 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/signal.h>
#include <linux/rbtree.h>
-#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
#include "ocfs2.h"
+#include "aops.h"
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
#include "mmap.h"
+#include "super.h"
+#include "ocfs2_trace.h"
-static struct page *ocfs2_nopage(struct vm_area_struct * area,
- unsigned long address,
- int *type)
+
+static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
{
- struct page *page = NOPAGE_SIGBUS;
- sigset_t blocked, oldset;
+ sigset_t oldset;
int ret;
- mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
- type);
+ ocfs2_block_signals(&oldset);
+ ret = filemap_fault(area, vmf);
+ ocfs2_unblock_signals(&oldset);
+
+ trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno,
+ area, vmf->page, vmf->pgoff);
+ return ret;
+}
+
+static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
+ struct page *page)
+{
+ int ret = VM_FAULT_NOPAGE;
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ loff_t pos = page_offset(page);
+ unsigned int len = PAGE_CACHE_SIZE;
+ pgoff_t last_index;
+ struct page *locked_page = NULL;
+ void *fsdata;
+ loff_t size = i_size_read(inode);
+
+ last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+
+ /*
+ * There are cases that lead to the page no longer bebongs to the
+ * mapping.
+ * 1) pagecache truncates locally due to memory pressure.
+ * 2) pagecache truncates when another is taking EX lock against
+ * inode lock. see ocfs2_data_convert_worker.
+ *
+ * The i_size check doesn't catch the case where nodes truncated and
+ * then re-extended the file. We'll re-check the page mapping after
+ * taking the page lock inside of ocfs2_write_begin_nolock().
+ *
+ * Let VM retry with these cases.
+ */
+ if ((page->mapping != inode->i_mapping) ||
+ (!PageUptodate(page)) ||
+ (page_offset(page) >= size))
+ goto out;
+
+ /*
+ * Call ocfs2_write_begin() and ocfs2_write_end() to take
+ * advantage of the allocation code there. We pass a write
+ * length of the whole page (chopped to i_size) to make sure
+ * the whole thing is allocated.
+ *
+ * Since we know the page is up to date, we don't have to
+ * worry about ocfs2_write_begin() skipping some buffer reads
+ * because the "write" would invalidate their data.
+ */
+ if (page->index == last_index)
+ len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
+
+ ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
+ &fsdata, di_bh, page);
+ if (ret) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ if (!locked_page) {
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+ ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
+ fsdata);
+ BUG_ON(ret != len);
+ ret = VM_FAULT_LOCKED;
+out:
+ return ret;
+}
+
+static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct buffer_head *di_bh = NULL;
+ sigset_t oldset;
+ int ret;
- /* The best way to deal with signals in this path is
- * to block them upfront, rather than allowing the
- * locking paths to return -ERESTARTSYS. */
- sigfillset(&blocked);
+ sb_start_pagefault(inode->i_sb);
+ ocfs2_block_signals(&oldset);
- /* We should technically never get a bad ret return
- * from sigprocmask */
- ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+ /*
+ * The cluster locks taken will block a truncate from another
+ * node. Taking the data lock will also ensure that we don't
+ * attempt page truncation as part of a downconvert.
+ */
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- page = filemap_nopage(area, address, type);
+ /*
+ * The alloc sem should be enough to serialize with
+ * ocfs2_truncate_file() changing i_size as well as any thread
+ * modifying the inode btree.
+ */
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
+
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+ brelse(di_bh);
+ ocfs2_inode_unlock(inode, 1);
- ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
- if (ret < 0)
- mlog_errno(ret);
out:
- mlog_exit_ptr(page);
- return page;
+ ocfs2_unblock_signals(&oldset);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
-static struct vm_operations_struct ocfs2_file_vm_ops = {
- .nopage = ocfs2_nopage,
+static const struct vm_operations_struct ocfs2_file_vm_ops = {
+ .fault = ocfs2_fault,
+ .page_mkwrite = ocfs2_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
{
- /* We don't want to support shared writable mappings yet. */
- if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
- && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
- mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
- /* This is -EINVAL because generic_file_readonly_mmap
- * returns it in a similar situation. */
- return -EINVAL;
- }
+ int ret = 0, lock_level = 0;
- file_accessed(file);
+ ret = ocfs2_inode_lock_atime(file_inode(file),
+ file->f_path.mnt, &lock_level);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ ocfs2_inode_unlock(file_inode(file), lock_level);
+out:
vma->vm_ops = &ocfs2_file_vm_ops;
return 0;
}
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
new file mode 100644
index 00000000000..599eb4c4c8b
--- /dev/null
+++ b/fs/ocfs2/move_extents.c
@@ -0,0 +1,1078 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.c
+ *
+ * Copyright (C) 2011 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/swap.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_ioctl.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "super.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "sysfile.h"
+#include "refcounttree.h"
+#include "move_extents.h"
+
+struct ocfs2_move_extents_context {
+ struct inode *inode;
+ struct file *file;
+ int auto_defrag;
+ int partial;
+ int credits;
+ u32 new_phys_cpos;
+ u32 clusters_moved;
+ u64 refcount_loc;
+ struct ocfs2_move_extents *range;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_alloc_context *meta_ac;
+ struct ocfs2_alloc_context *data_ac;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+};
+
+static int __ocfs2_move_extent(handle_t *handle,
+ struct ocfs2_move_extents_context *context,
+ u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
+ int ext_flags)
+{
+ int ret = 0, index;
+ struct inode *inode = context->inode;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_extent_rec *rec, replace_rec;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_list *el;
+ u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
+ u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
+
+ ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
+ p_cpos, new_p_cpos, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ memset(&replace_rec, 0, sizeof(replace_rec));
+ replace_rec.e_cpos = cpu_to_le32(cpos);
+ replace_rec.e_leaf_clusters = cpu_to_le16(len);
+ replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
+ new_p_cpos));
+
+ path = ocfs2_new_path_from_et(&context->et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has an extent at cpos %u which can no "
+ "longer be found.\n",
+ (unsigned long long)ino, cpos);
+ ret = -EROFS;
+ goto out;
+ }
+
+ rec = &el->l_recs[index];
+
+ BUG_ON(ext_flags != rec->e_flags);
+ /*
+ * after moving/defraging to new location, the extent is not going
+ * to be refcounted anymore.
+ */
+ replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ context->et.et_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_split_extent(handle, &context->et, path, index,
+ &replace_rec, context->meta_ac,
+ &context->dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_journal_dirty(handle, context->et.et_root_bh);
+
+ context->new_phys_cpos = new_p_cpos;
+
+ /*
+ * need I to append truncate log for old clusters?
+ */
+ if (old_blkno) {
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ ret = ocfs2_decrease_refcount(inode, handle,
+ ocfs2_blocks_to_clusters(osb->sb,
+ old_blkno),
+ len, context->meta_ac,
+ &context->dealloc, 1);
+ else
+ ret = ocfs2_truncate_log_append(osb, handle,
+ old_blkno, len);
+ }
+
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
+out:
+ ocfs2_free_path(path);
+ return ret;
+}
+
+/*
+ * lock allocators, and reserving appropriate number of bits for
+ * meta blocks and data clusters.
+ *
+ * in some cases, we don't need to reserve clusters, just let data_ac
+ * be NULL.
+ */
+static int ocfs2_lock_allocators_move_extents(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters_to_move,
+ u32 extents_to_split,
+ struct ocfs2_alloc_context **meta_ac,
+ struct ocfs2_alloc_context **data_ac,
+ int extra_blocks,
+ int *credits)
+{
+ int ret, num_free_extents;
+ unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ num_free_extents = ocfs2_num_free_extents(osb, et);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!num_free_extents ||
+ (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
+ extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+ ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (data_ac) {
+ ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
+
+ mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
+ extra_blocks, clusters_to_move, *credits);
+out:
+ if (ret) {
+ if (*meta_ac) {
+ ocfs2_free_alloc_context(*meta_ac);
+ *meta_ac = NULL;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Using one journal handle to guarantee the data consistency in case
+ * crash happens anywhere.
+ *
+ * XXX: defrag can end up with finishing partial extent as requested,
+ * due to not enough contiguous clusters can be found in allocator.
+ */
+static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
+ u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
+{
+ int ret, credits = 0, extra_blocks = 0, partial = context->partial;
+ handle_t *handle;
+ struct inode *inode = context->inode;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+ u32 new_phys_cpos, new_len;
+ u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+ if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
+
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+ OCFS2_HAS_REFCOUNT_FL));
+
+ BUG_ON(!context->refcount_loc);
+
+ ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+ &ref_tree, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_prepare_refcount_change_for_del(inode,
+ context->refcount_loc,
+ phys_blkno,
+ *len,
+ &credits,
+ &extra_blocks);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
+ &context->meta_ac,
+ &context->data_ac,
+ extra_blocks, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * should be using allocation reservation strategy there?
+ *
+ * if (context->data_ac)
+ * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+ */
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock_mutex;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock_mutex;
+ }
+
+ ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
+ &new_phys_cpos, &new_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /*
+ * allowing partial extent moving is kind of 'pros and cons', it makes
+ * whole defragmentation less likely to fail, on the contrary, the bad
+ * thing is it may make the fs even more fragmented after moving, let
+ * userspace make a good decision here.
+ */
+ if (new_len != *len) {
+ mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
+ if (!partial) {
+ context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
+ ret = -ENOSPC;
+ goto out_commit;
+ }
+ }
+
+ mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
+ phys_cpos, new_phys_cpos);
+
+ ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
+ new_phys_cpos, ext_flags);
+ if (ret)
+ mlog_errno(ret);
+
+ if (partial && (new_len != *len))
+ *len = new_len;
+
+ /*
+ * Here we should write the new page out first if we are
+ * in write-back mode.
+ */
+ ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out_unlock_mutex:
+ mutex_unlock(&tl_inode->i_mutex);
+
+ if (context->data_ac) {
+ ocfs2_free_alloc_context(context->data_ac);
+ context->data_ac = NULL;
+ }
+
+ if (context->meta_ac) {
+ ocfs2_free_alloc_context(context->meta_ac);
+ context->meta_ac = NULL;
+ }
+
+out:
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+ return ret;
+}
+
+/*
+ * find the victim alloc group, where #blkno fits.
+ */
+static int ocfs2_find_victim_alloc_group(struct inode *inode,
+ u64 vict_blkno,
+ int type, int slot,
+ int *vict_bit,
+ struct buffer_head **ret_bh)
+{
+ int ret, i, bits_per_unit = 0;
+ u64 blkno;
+ char namebuf[40];
+
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
+ struct ocfs2_chain_list *cl;
+ struct ocfs2_chain_rec *rec;
+ struct ocfs2_dinode *ac_dinode;
+ struct ocfs2_group_desc *bg;
+
+ ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
+ ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
+ strlen(namebuf), &blkno);
+ if (ret) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
+ cl = &(ac_dinode->id2.i_chain);
+ rec = &(cl->cl_recs[0]);
+
+ if (type == GLOBAL_BITMAP_SYSTEM_INODE)
+ bits_per_unit = osb->s_clustersize_bits -
+ inode->i_sb->s_blocksize_bits;
+ /*
+ * 'vict_blkno' was out of the valid range.
+ */
+ if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
+ (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
+ bits_per_unit))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+
+ rec = &(cl->cl_recs[i]);
+ if (!rec)
+ continue;
+
+ bg = NULL;
+
+ do {
+ if (!bg)
+ blkno = le64_to_cpu(rec->c_blkno);
+ else
+ blkno = le64_to_cpu(bg->bg_next_group);
+
+ if (gd_bh) {
+ brelse(gd_bh);
+ gd_bh = NULL;
+ }
+
+ ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+ if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
+ le16_to_cpu(bg->bg_bits))) {
+
+ *ret_bh = gd_bh;
+ *vict_bit = (vict_blkno - blkno) >>
+ bits_per_unit;
+ mlog(0, "find the victim group: #%llu, "
+ "total_bits: %u, vict_bit: %u\n",
+ blkno, le16_to_cpu(bg->bg_bits),
+ *vict_bit);
+ goto out;
+ }
+
+ } while (le64_to_cpu(bg->bg_next_group));
+ }
+
+ ret = -EINVAL;
+out:
+ brelse(ac_bh);
+
+ /*
+ * caller has to release the gd_bh properly.
+ */
+ return ret;
+}
+
+/*
+ * XXX: helper to validate and adjust moving goal.
+ */
+static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
+ struct ocfs2_move_extents *range)
+{
+ int ret, goal_bit = 0;
+
+ struct buffer_head *gd_bh = NULL;
+ struct ocfs2_group_desc *bg;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int c_to_b = 1 << (osb->s_clustersize_bits -
+ inode->i_sb->s_blocksize_bits);
+
+ /*
+ * make goal become cluster aligned.
+ */
+ range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
+ range->me_goal);
+ /*
+ * validate goal sits within global_bitmap, and return the victim
+ * group desc
+ */
+ ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT,
+ &goal_bit, &gd_bh);
+ if (ret)
+ goto out;
+
+ bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+ /*
+ * moving goal is not allowd to start with a group desc blok(#0 blk)
+ * let's compromise to the latter cluster.
+ */
+ if (range->me_goal == le64_to_cpu(bg->bg_blkno))
+ range->me_goal += c_to_b;
+
+ /*
+ * movement is not gonna cross two groups.
+ */
+ if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
+ range->me_len) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /*
+ * more exact validations/adjustments will be performed later during
+ * moving operation for each extent range.
+ */
+ mlog(0, "extents get ready to be moved to #%llu block\n",
+ range->me_goal);
+
+out:
+ brelse(gd_bh);
+
+ return ret;
+}
+
+static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
+ int *goal_bit, u32 move_len, u32 max_hop,
+ u32 *phys_cpos)
+{
+ int i, used, last_free_bits = 0, base_bit = *goal_bit;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+ u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+ le64_to_cpu(gd->bg_blkno));
+
+ for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
+
+ used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
+ if (used) {
+ /*
+ * we even tried searching the free chunk by jumping
+ * a 'max_hop' distance, but still failed.
+ */
+ if ((i - base_bit) > max_hop) {
+ *phys_cpos = 0;
+ break;
+ }
+
+ if (last_free_bits)
+ last_free_bits = 0;
+
+ continue;
+ } else
+ last_free_bits++;
+
+ if (last_free_bits == move_len) {
+ *goal_bit = i;
+ *phys_cpos = base_cpos + i;
+ break;
+ }
+ }
+
+ mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
+}
+
+static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
+ u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
+ u32 len, int ext_flags)
+{
+ int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
+ handle_t *handle;
+ struct inode *inode = context->inode;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct inode *gb_inode = NULL;
+ struct buffer_head *gb_bh = NULL;
+ struct buffer_head *gd_bh = NULL;
+ struct ocfs2_group_desc *gd;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+ u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
+ context->range->me_threshold);
+ u64 phys_blkno, new_phys_blkno;
+
+ phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+ if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
+
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+ OCFS2_HAS_REFCOUNT_FL));
+
+ BUG_ON(!context->refcount_loc);
+
+ ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+ &ref_tree, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_prepare_refcount_change_for_del(inode,
+ context->refcount_loc,
+ phys_blkno,
+ len,
+ &credits,
+ &extra_blocks);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
+ &context->meta_ac,
+ NULL, extra_blocks, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * need to count 2 extra credits for global_bitmap inode and
+ * group descriptor.
+ */
+ credits += OCFS2_INODE_UPDATE_CREDITS + 1;
+
+ /*
+ * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
+ * logic, while we still need to lock the global_bitmap.
+ */
+ gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!gb_inode) {
+ mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+ ret = -EIO;
+ goto out;
+ }
+
+ mutex_lock(&gb_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_gb_mutex;
+ }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock_tl_inode;
+ }
+
+ new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
+ ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT,
+ &goal_bit, &gd_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /*
+ * probe the victim cluster group to find a proper
+ * region to fit wanted movement, it even will perfrom
+ * a best-effort attempt by compromising to a threshold
+ * around the goal.
+ */
+ ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
+ new_phys_cpos);
+ if (!*new_phys_cpos) {
+ ret = -ENOSPC;
+ goto out_commit;
+ }
+
+ ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
+ *new_phys_cpos, ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+ ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
+ le16_to_cpu(gd->bg_chain));
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
+ goal_bit, len);
+ if (ret) {
+ ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
+ le16_to_cpu(gd->bg_chain));
+ mlog_errno(ret);
+ }
+
+ /*
+ * Here we should write the new page out first if we are
+ * in write-back mode.
+ */
+ ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+ brelse(gd_bh);
+
+out_unlock_tl_inode:
+ mutex_unlock(&tl_inode->i_mutex);
+
+ ocfs2_inode_unlock(gb_inode, 1);
+out_unlock_gb_mutex:
+ mutex_unlock(&gb_inode->i_mutex);
+ brelse(gb_bh);
+ iput(gb_inode);
+
+out:
+ if (context->meta_ac) {
+ ocfs2_free_alloc_context(context->meta_ac);
+ context->meta_ac = NULL;
+ }
+
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+ return ret;
+}
+
+/*
+ * Helper to calculate the defraging length in one run according to threshold.
+ */
+static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
+ u32 threshold, int *skip)
+{
+ if ((*alloc_size + *len_defraged) < threshold) {
+ /*
+ * proceed defragmentation until we meet the thresh
+ */
+ *len_defraged += *alloc_size;
+ } else if (*len_defraged == 0) {
+ /*
+ * XXX: skip a large extent.
+ */
+ *skip = 1;
+ } else {
+ /*
+ * split this extent to coalesce with former pieces as
+ * to reach the threshold.
+ *
+ * we're done here with one cycle of defragmentation
+ * in a size of 'thresh', resetting 'len_defraged'
+ * forces a new defragmentation.
+ */
+ *alloc_size = threshold - *len_defraged;
+ *len_defraged = 0;
+ }
+}
+
+static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
+ struct ocfs2_move_extents_context *context)
+{
+ int ret = 0, flags, do_defrag, skip = 0;
+ u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
+ u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
+
+ struct inode *inode = context->inode;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_move_extents *range = context->range;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if ((i_size_read(inode) == 0) || (range->me_len == 0))
+ return 0;
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
+
+ ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
+ ocfs2_init_dealloc_ctxt(&context->dealloc);
+
+ /*
+ * TO-DO XXX:
+ *
+ * - xattr extents.
+ */
+
+ do_defrag = context->auto_defrag;
+
+ /*
+ * extents moving happens in unit of clusters, for the sake
+ * of simplicity, we may ignore two clusters where 'byte_start'
+ * and 'byte_start + len' were within.
+ */
+ move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
+ len_to_move = (range->me_start + range->me_len) >>
+ osb->s_clustersize_bits;
+ if (len_to_move >= move_start)
+ len_to_move -= move_start;
+ else
+ len_to_move = 0;
+
+ if (do_defrag) {
+ defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
+ if (defrag_thresh <= 1)
+ goto done;
+ } else
+ new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+ range->me_goal);
+
+ mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
+ "thresh: %u\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)range->me_start,
+ (unsigned long long)range->me_len,
+ move_start, len_to_move, defrag_thresh);
+
+ cpos = move_start;
+ while (len_to_move) {
+ ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
+ &flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (alloc_size > len_to_move)
+ alloc_size = len_to_move;
+
+ /*
+ * XXX: how to deal with a hole:
+ *
+ * - skip the hole of course
+ * - force a new defragmentation
+ */
+ if (!phys_cpos) {
+ if (do_defrag)
+ len_defraged = 0;
+
+ goto next;
+ }
+
+ if (do_defrag) {
+ ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
+ defrag_thresh, &skip);
+ /*
+ * skip large extents
+ */
+ if (skip) {
+ skip = 0;
+ goto next;
+ }
+
+ mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
+ "alloc_size: %u, len_defraged: %u\n",
+ cpos, phys_cpos, alloc_size, len_defraged);
+
+ ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
+ &alloc_size, flags);
+ } else {
+ ret = ocfs2_move_extent(context, cpos, phys_cpos,
+ &new_phys_cpos, alloc_size,
+ flags);
+
+ new_phys_cpos += alloc_size;
+ }
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ context->clusters_moved += alloc_size;
+next:
+ cpos += alloc_size;
+ len_to_move -= alloc_size;
+ }
+
+done:
+ range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
+
+out:
+ range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
+ context->clusters_moved);
+ range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
+ context->new_phys_cpos);
+
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &context->dealloc);
+
+ return ret;
+}
+
+static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
+{
+ int status;
+ handle_t *handle;
+ struct inode *inode = context->inode;
+ struct ocfs2_dinode *di;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (!inode)
+ return -ENOENT;
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * This prevents concurrent writes from other nodes
+ */
+ status = ocfs2_rw_lock(inode, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ status = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out_rw_unlock;
+ }
+
+ /*
+ * rememer ip_xattr_sem also needs to be held if necessary
+ */
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+ status = __ocfs2_move_extents_range(di_bh, context);
+
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ if (status) {
+ mlog_errno(status);
+ goto out_inode_unlock;
+ }
+
+ /*
+ * We update ctime for these changes
+ */
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_inode_unlock;
+ }
+
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ inode->i_ctime = CURRENT_TIME;
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out_inode_unlock:
+ brelse(di_bh);
+ ocfs2_inode_unlock(inode, 1);
+out_rw_unlock:
+ ocfs2_rw_unlock(inode, 1);
+out:
+ mutex_unlock(&inode->i_mutex);
+
+ return status;
+}
+
+int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
+{
+ int status;
+
+ struct inode *inode = file_inode(filp);
+ struct ocfs2_move_extents range;
+ struct ocfs2_move_extents_context *context;
+
+ if (!argp)
+ return -EINVAL;
+
+ status = mnt_want_write_file(filp);
+ if (status)
+ return status;
+
+ if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
+ status = -EPERM;
+ goto out_drop;
+ }
+
+ if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+ status = -EPERM;
+ goto out_drop;
+ }
+
+ context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
+ if (!context) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_drop;
+ }
+
+ context->inode = inode;
+ context->file = filp;
+
+ if (copy_from_user(&range, argp, sizeof(range))) {
+ status = -EFAULT;
+ goto out_free;
+ }
+
+ if (range.me_start > i_size_read(inode)) {
+ status = -EINVAL;
+ goto out_free;
+ }
+
+ if (range.me_start + range.me_len > i_size_read(inode))
+ range.me_len = i_size_read(inode) - range.me_start;
+
+ context->range = &range;
+
+ if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
+ context->auto_defrag = 1;
+ /*
+ * ok, the default theshold for the defragmentation
+ * is 1M, since our maximum clustersize was 1M also.
+ * any thought?
+ */
+ if (!range.me_threshold)
+ range.me_threshold = 1024 * 1024;
+
+ if (range.me_threshold > i_size_read(inode))
+ range.me_threshold = i_size_read(inode);
+
+ if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
+ context->partial = 1;
+ } else {
+ /*
+ * first best-effort attempt to validate and adjust the goal
+ * (physical address in block), while it can't guarantee later
+ * operation can succeed all the time since global_bitmap may
+ * change a bit over time.
+ */
+
+ status = ocfs2_validate_and_adjust_move_goal(inode, &range);
+ if (status)
+ goto out_copy;
+ }
+
+ status = ocfs2_move_extents(context);
+ if (status)
+ mlog_errno(status);
+out_copy:
+ /*
+ * movement/defragmentation may end up being partially completed,
+ * that's the reason why we need to return userspace the finished
+ * length and new_offset even if failure happens somewhere.
+ */
+ if (copy_to_user(argp, &range, sizeof(range)))
+ status = -EFAULT;
+
+out_free:
+ kfree(context);
+out_drop:
+ mnt_drop_write_file(filp);
+
+ return status;
+}
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
new file mode 100644
index 00000000000..4e143e81144
--- /dev/null
+++ b/fs/ocfs2/move_extents.h
@@ -0,0 +1,22 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.h
+ *
+ * Copyright (C) 2011 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_MOVE_EXTENTS_H
+#define OCFS2_MOVE_EXTENTS_H
+
+int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
+
+#endif /* OCFS2_MOVE_EXTENTS_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 849c3b4bb94..8add6f1030d 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -40,8 +40,8 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
+#include <linux/quotaops.h>
-#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -60,112 +60,62 @@
#include "symlink.h"
#include "sysfile.h"
#include "uptodate.h"
-#include "vote.h"
+#include "xattr.h"
+#include "acl.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
-#define NAMEI_RA_CHUNKS 2
-#define NAMEI_RA_BLOCKS 4
-#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
-
-static int inline ocfs2_search_dirblock(struct buffer_head *bh,
- struct inode *dir,
- const char *name, int namelen,
- unsigned long offset,
- struct ocfs2_dir_entry **res_dir);
-
-static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
- struct inode *dir,
- struct ocfs2_dir_entry *de_del,
- struct buffer_head *bh);
-
-static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
- struct inode *dir,
- const char *name, int namelen,
- struct inode *inode, u64 blkno,
- struct buffer_head *parent_fe_bh,
- struct buffer_head *insert_bh);
-
static int ocfs2_mknod_locked(struct ocfs2_super *osb,
struct inode *dir,
- struct dentry *dentry, int mode,
+ struct inode *inode,
dev_t dev,
struct buffer_head **new_fe_bh,
struct buffer_head *parent_fe_bh,
- struct ocfs2_journal_handle *handle,
- struct inode **ret_inode,
+ handle_t *handle,
struct ocfs2_alloc_context *inode_ac);
-static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *parent,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_alloc_context *data_ac);
-
-static int ocfs2_double_lock(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct buffer_head **bh1,
- struct inode *inode1,
- struct buffer_head **bh2,
- struct inode *inode2);
-
static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *inode,
+ struct inode **ret_orphan_dir,
+ u64 blkno,
char *name,
- struct buffer_head **de_bh);
+ struct ocfs2_dir_lookup_result *lookup);
static int ocfs2_orphan_add(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+ handle_t *handle,
struct inode *inode,
- struct ocfs2_dinode *fe,
+ struct buffer_head *fe_bh,
char *name,
- struct buffer_head *de_bh);
+ struct ocfs2_dir_lookup_result *lookup,
+ struct inode *orphan_dir_inode);
static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+ handle_t *handle,
struct inode *inode,
const char *symname);
-static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle,
- struct dentry *dentry,
- struct inode *inode, u64 blkno,
- struct buffer_head *parent_fe_bh,
- struct buffer_head *insert_bh)
-{
- return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
- dentry->d_name.name, dentry->d_name.len,
- inode, blkno, parent_fe_bh, insert_bh);
-}
-
/* An orphan dir name is an 8 byte value, printed as a hex string */
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
int status;
u64 blkno;
- struct buffer_head *dirent_bh = NULL;
struct inode *inode = NULL;
struct dentry *ret;
- struct ocfs2_dir_entry *dirent;
struct ocfs2_inode_info *oi;
- mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_lookup(dir, dentry, dentry->d_name.len,
+ dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, 0);
if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
ret = ERR_PTR(-ENAMETOOLONG);
goto bail;
}
- mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
- dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
-
- status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+ status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -173,16 +123,13 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
goto bail;
}
- status = ocfs2_find_files_on_disk(dentry->d_name.name,
- dentry->d_name.len, &blkno,
- dir, &dirent_bh, &dirent);
+ status = ocfs2_lookup_ino_from_name(dir, dentry->d_name.name,
+ dentry->d_name.len, &blkno);
if (status < 0)
goto bail_add;
- inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
+ inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
if (IS_ERR(inode)) {
- mlog(ML_ERROR, "Unable to create inode %llu\n",
- (unsigned long long)blkno);
ret = ERR_PTR(-EACCES);
goto bail_unlock;
}
@@ -195,11 +142,9 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
* unlink. */
spin_lock(&oi->ip_lock);
oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
- oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
spin_unlock(&oi->ip_lock);
bail_add:
- dentry->d_op = &ocfs2_dentry_ops;
ret = d_splice_alias(inode, dentry);
if (inode) {
@@ -223,137 +168,109 @@ bail_add:
ret = ERR_PTR(status);
goto bail_unlock;
}
- }
+ } else
+ ocfs2_dentry_attach_gen(dentry);
bail_unlock:
/* Don't drop the cluster lock until *after* the d_add --
* unlink on another node will message us to remove that
* dentry under this lock so otherwise we can race this with
- * the vote thread and have a stale dentry. */
- ocfs2_meta_unlock(dir, 0);
+ * the downconvert thread and have a stale dentry. */
+ ocfs2_inode_unlock(dir, 0);
bail:
- if (dirent_bh)
- brelse(dirent_bh);
- mlog_exit_ptr(ret);
+ trace_ocfs2_lookup_ret(ret);
return ret;
}
-static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *parent,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_alloc_context *data_ac)
+static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
{
- int status;
- struct buffer_head *new_bh = NULL;
- struct ocfs2_dir_entry *de = NULL;
-
- mlog_entry_void();
-
- status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
- data_ac, NULL, &new_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ struct inode *inode;
- ocfs2_set_new_buffer_uptodate(inode, new_bh);
-
- status = ocfs2_journal_access(handle, inode, new_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- memset(new_bh->b_data, 0, osb->sb->s_blocksize);
-
- de = (struct ocfs2_dir_entry *) new_bh->b_data;
- de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
- de->name_len = 1;
- de->rec_len =
- cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
- strcpy(de->name, ".");
- ocfs2_set_de_type(de, S_IFDIR);
- de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
- de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
- de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
- OCFS2_DIR_REC_LEN(1));
- de->name_len = 2;
- strcpy(de->name, "..");
- ocfs2_set_de_type(de, S_IFDIR);
-
- status = ocfs2_journal_dirty(handle, new_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- i_size_write(inode, inode->i_sb->s_blocksize);
- inode->i_nlink = 2;
- inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
- status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ inode = new_inode(dir->i_sb);
+ if (!inode) {
+ mlog(ML_ERROR, "new_inode failed!\n");
+ return NULL;
}
- status = 0;
-bail:
- if (new_bh)
- brelse(new_bh);
+ /* populate as many fields early on as possible - many of
+ * these are used by the support functions here and in
+ * callers. */
+ if (S_ISDIR(mode))
+ set_nlink(inode, 2);
+ inode_init_owner(inode, dir, mode);
+ dquot_initialize(inode);
+ return inode;
+}
- mlog_exit(status);
- return status;
+static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb,
+ struct dentry *dentry, struct inode *inode)
+{
+ struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
+
+ ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
+ ocfs2_lock_res_free(&dl->dl_lockres);
+ BUG_ON(dl->dl_count != 1);
+ spin_lock(&dentry_attach_lock);
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry_attach_lock);
+ kfree(dl);
+ iput(inode);
}
static int ocfs2_mknod(struct inode *dir,
struct dentry *dentry,
- int mode,
+ umode_t mode,
dev_t dev)
{
int status = 0;
struct buffer_head *parent_fe_bh = NULL;
- struct ocfs2_journal_handle *handle = NULL;
+ handle_t *handle = NULL;
struct ocfs2_super *osb;
struct ocfs2_dinode *dirfe;
struct buffer_head *new_fe_bh = NULL;
- struct buffer_head *de_bh = NULL;
struct inode *inode = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
struct ocfs2_alloc_context *data_ac = NULL;
-
- mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
- (unsigned long)dev, dentry->d_name.len,
- dentry->d_name.name);
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ int want_clusters = 0;
+ int want_meta = 0;
+ int xattr_credits = 0;
+ struct ocfs2_security_xattr_info si = {
+ .enable = 1,
+ };
+ int did_quota_inode = 0;
+ struct ocfs2_dir_lookup_result lookup = { NULL, };
+ sigset_t oldset;
+ int did_block_signals = 0;
+ struct posix_acl *default_acl = NULL, *acl = NULL;
+ struct ocfs2_dentry_lock *dl = NULL;
+
+ trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long)dev, mode);
+
+ dquot_initialize(dir);
/* get our super block */
osb = OCFS2_SB(dir->i_sb);
- handle = ocfs2_alloc_handle(osb);
- if (handle == NULL) {
- status = -ENOMEM;
- mlog_errno(status);
- goto leave;
- }
-
- status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+ status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
- goto leave;
+ return status;
}
- if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
+ if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) {
status = -EMLINK;
goto leave;
}
dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
- if (!dirfe->i_links_count) {
+ if (!ocfs2_read_links_count(dirfe)) {
/* can't make a file in a deleted directory. */
status = -ENOENT;
goto leave;
@@ -367,32 +284,79 @@ static int ocfs2_mknod(struct inode *dir,
/* get a spot inside the dir. */
status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
dentry->d_name.name,
- dentry->d_name.len, &de_bh);
+ dentry->d_name.len, &lookup);
if (status < 0) {
mlog_errno(status);
goto leave;
}
/* reserve an inode spot */
- status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
+ status = ocfs2_reserve_new_inode(osb, &inode_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto leave;
}
- /* are we making a directory? If so, reserve a cluster for his
- * 1st extent. */
- if (S_ISDIR(mode)) {
- status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
- if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
+ inode = ocfs2_get_init_inode(dir, mode);
+ if (!inode) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* get security xattr */
+ status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
+ if (status) {
+ if (status == -EOPNOTSUPP)
+ si.enable = 0;
+ else {
+ mlog_errno(status);
goto leave;
}
}
- handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS);
+ /* calculate meta data/clusters for setting security and acl xattr */
+ status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
+ &si, &want_clusters,
+ &xattr_credits, &want_meta);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* Reserve a cluster if creating an extent based directory. */
+ if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
+ want_clusters += 1;
+
+ /* Dir indexing requires extra space as well */
+ if (ocfs2_supports_indexed_dirs(osb))
+ want_meta++;
+ }
+
+ status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto leave;
+ }
+
+ status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto leave;
+ }
+
+ status = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (status) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
+ S_ISDIR(mode),
+ xattr_credits));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -400,10 +364,19 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
+ /* Starting to change things, restart is no longer possible. */
+ ocfs2_block_signals(&oldset);
+ did_block_signals = 1;
+
+ status = dquot_alloc_inode(inode);
+ if (status)
+ goto leave;
+ did_quota_inode = 1;
+
/* do the real work now. */
- status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
+ status = ocfs2_mknod_locked(osb, dir, inode, dev,
&new_fe_bh, parent_fe_bh, handle,
- &inode, inode_ac);
+ inode_ac);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -411,35 +384,54 @@ static int ocfs2_mknod(struct inode *dir,
if (S_ISDIR(mode)) {
status = ocfs2_fill_new_dir(osb, handle, dir, inode,
- new_fe_bh, data_ac);
+ new_fe_bh, data_ac, meta_ac);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- status = ocfs2_journal_access(handle, dir, parent_fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
- le16_add_cpu(&dirfe->i_links_count, 1);
- status = ocfs2_journal_dirty(handle, parent_fe_bh);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(dir),
+ parent_fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- dir->i_nlink++;
+ ocfs2_add_links_count(dirfe, 1);
+ ocfs2_journal_dirty(handle, parent_fe_bh);
+ inc_nlink(dir);
+ }
+
+ if (default_acl) {
+ status = ocfs2_set_acl(handle, inode, new_fe_bh,
+ ACL_TYPE_DEFAULT, default_acl,
+ meta_ac, data_ac);
+ }
+ if (!status && acl) {
+ status = ocfs2_set_acl(handle, inode, new_fe_bh,
+ ACL_TYPE_ACCESS, acl,
+ meta_ac, data_ac);
}
- status = ocfs2_add_entry(handle, dentry, inode,
- OCFS2_I(inode)->ip_blkno, parent_fe_bh,
- de_bh);
if (status < 0) {
mlog_errno(status);
goto leave;
}
+ if (si.enable) {
+ status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+ meta_ac, data_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+ }
+
+ /*
+ * Do this before adding the entry to the directory. We add
+ * also set d_op after success so that ->d_iput() will cleanup
+ * the dentry lock even if ocfs2_add_entry() fails below.
+ */
status = ocfs2_dentry_attach_lock(dentry, inode,
OCFS2_I(dir)->ip_blkno);
if (status) {
@@ -447,28 +439,38 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
+ dl = dentry->d_fsdata;
+
+ status = ocfs2_add_entry(handle, dentry, inode,
+ OCFS2_I(inode)->ip_blkno, parent_fe_bh,
+ &lookup);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
insert_inode_hash(inode);
- dentry->d_op = &ocfs2_dentry_ops;
d_instantiate(dentry, inode);
status = 0;
leave:
+ if (default_acl)
+ posix_acl_release(default_acl);
+ if (acl)
+ posix_acl_release(acl);
+ if (status < 0 && did_quota_inode)
+ dquot_free_inode(inode);
if (handle)
- ocfs2_commit_trans(handle);
+ ocfs2_commit_trans(osb, handle);
- if (status == -ENOSPC)
- mlog(0, "Disk is full\n");
+ ocfs2_inode_unlock(dir, 1);
+ if (did_block_signals)
+ ocfs2_unblock_signals(&oldset);
- if (new_fe_bh)
- brelse(new_fe_bh);
+ brelse(new_fe_bh);
+ brelse(parent_fe_bh);
+ kfree(si.value);
- if (de_bh)
- brelse(de_bh);
-
- if (parent_fe_bh)
- brelse(parent_fe_bh);
-
- if ((status < 0) && inode)
- iput(inode);
+ ocfs2_free_dir_lookup_result(&lookup);
if (inode_ac)
ocfs2_free_alloc_context(inode_ac);
@@ -476,73 +478,67 @@ leave:
if (data_ac)
ocfs2_free_alloc_context(data_ac);
- mlog_exit(status);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ /*
+ * We should call iput after the i_mutex of the bitmap been
+ * unlocked in ocfs2_free_alloc_context, or the
+ * ocfs2_delete_inode will mutex_lock again.
+ */
+ if ((status < 0) && inode) {
+ if (dl)
+ ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
+ OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
+ clear_nlink(inode);
+ iput(inode);
+ }
+
+ if (status)
+ mlog_errno(status);
return status;
}
-static int ocfs2_mknod_locked(struct ocfs2_super *osb,
- struct inode *dir,
- struct dentry *dentry, int mode,
- dev_t dev,
- struct buffer_head **new_fe_bh,
- struct buffer_head *parent_fe_bh,
- struct ocfs2_journal_handle *handle,
- struct inode **ret_inode,
- struct ocfs2_alloc_context *inode_ac)
+static int __ocfs2_mknod_locked(struct inode *dir,
+ struct inode *inode,
+ dev_t dev,
+ struct buffer_head **new_fe_bh,
+ struct buffer_head *parent_fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *inode_ac,
+ u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
{
int status = 0;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dinode *fe = NULL;
struct ocfs2_extent_list *fel;
- u64 fe_blkno = 0;
- u16 suballoc_bit;
- struct inode *inode = NULL;
-
- mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
- (unsigned long)dev, dentry->d_name.len,
- dentry->d_name.name);
+ u16 feat;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
*new_fe_bh = NULL;
- *ret_inode = NULL;
-
- status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
- &fe_blkno);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- inode = new_inode(dir->i_sb);
- if (IS_ERR(inode)) {
- status = PTR_ERR(inode);
- mlog(ML_ERROR, "new_inode failed!\n");
- goto leave;
- }
/* populate as many fields early on as possible - many of
* these are used by the support functions here and in
* callers. */
inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
OCFS2_I(inode)->ip_blkno = fe_blkno;
- if (S_ISDIR(mode))
- inode->i_nlink = 2;
- else
- inode->i_nlink = 1;
- inode->i_mode = mode;
spin_lock(&osb->osb_lock);
inode->i_generation = osb->s_next_generation++;
spin_unlock(&osb->osb_lock);
*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
if (!*new_fe_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto leave;
}
- ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), *new_fe_bh);
- status = ocfs2_journal_access(handle, inode, *new_fe_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ *new_fe_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -554,97 +550,124 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
fe->i_generation = cpu_to_le32(inode->i_generation);
fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
fe->i_blkno = cpu_to_le64(fe_blkno);
+ fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
- fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
- fe->i_uid = cpu_to_le32(current->fsuid);
- if (dir->i_mode & S_ISGID) {
- fe->i_gid = cpu_to_le32(dir->i_gid);
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- fe->i_gid = cpu_to_le32(current->fsgid);
- fe->i_mode = cpu_to_le16(mode);
- if (S_ISCHR(mode) || S_ISBLK(mode))
+ fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
+ fe->i_uid = cpu_to_le32(i_uid_read(inode));
+ fe->i_gid = cpu_to_le32(i_gid_read(inode));
+ fe->i_mode = cpu_to_le16(inode->i_mode);
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
- fe->i_links_count = cpu_to_le16(inode->i_nlink);
+ ocfs2_set_links_count(fe, inode->i_nlink);
fe->i_last_eb_blk = 0;
strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
- le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
+ fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
fe->i_atime = fe->i_ctime = fe->i_mtime =
cpu_to_le64(CURRENT_TIME.tv_sec);
fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
cpu_to_le32(CURRENT_TIME.tv_nsec);
fe->i_dtime = 0;
- fel = &fe->id2.i_list;
- fel->l_tree_depth = 0;
- fel->l_next_free_rec = 0;
- fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
+ /*
+ * If supported, directories start with inline data. If inline
+ * isn't supported, but indexing is, we start them as indexed.
+ */
+ feat = le16_to_cpu(fe->i_dyn_features);
+ if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
+ fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
- status = ocfs2_journal_dirty(handle, *new_fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
+ fe->id2.i_data.id_count = cpu_to_le16(
+ ocfs2_max_inline_data_with_xattr(osb->sb, fe));
+ } else {
+ fel = &fe->id2.i_list;
+ fel->l_tree_depth = 0;
+ fel->l_next_free_rec = 0;
+ fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
}
- if (ocfs2_populate_inode(inode, fe, 1) < 0) {
- mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
- "i_blkno=%llu, i_ino=%lu\n",
- (unsigned long long) (*new_fe_bh)->b_blocknr,
- (unsigned long long)fe->i_blkno, inode->i_ino);
- BUG();
- }
+ ocfs2_journal_dirty(handle, *new_fe_bh);
- ocfs2_inode_set_new(osb, inode);
- status = ocfs2_create_new_inode_locks(inode);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_populate_inode(inode, fe, 1);
+ ocfs2_ci_set_new(osb, INODE_CACHE(inode));
+ if (!ocfs2_mount_local(osb)) {
+ status = ocfs2_create_new_inode_locks(inode);
+ if (status < 0)
+ mlog_errno(status);
+ }
- status = 0; /* error in ocfs2_create_new_inode_locks is not
- * critical */
+ oi->i_sync_tid = handle->h_transaction->t_tid;
+ oi->i_datasync_tid = handle->h_transaction->t_tid;
- *ret_inode = inode;
leave:
if (status < 0) {
if (*new_fe_bh) {
brelse(*new_fe_bh);
*new_fe_bh = NULL;
}
- if (inode)
- iput(inode);
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+ struct inode *dir,
+ struct inode *inode,
+ dev_t dev,
+ struct buffer_head **new_fe_bh,
+ struct buffer_head *parent_fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *inode_ac)
+{
+ int status = 0;
+ u64 suballoc_loc, fe_blkno = 0;
+ u16 suballoc_bit;
+
+ *new_fe_bh = NULL;
+
+ status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
+ inode_ac, &suballoc_loc,
+ &suballoc_bit, &fe_blkno);
+ if (status < 0) {
+ mlog_errno(status);
+ return status;
+ }
+
+ return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+ parent_fe_bh, handle, inode_ac,
+ fe_blkno, suballoc_loc, suballoc_bit);
+}
+
static int ocfs2_mkdir(struct inode *dir,
struct dentry *dentry,
- int mode)
+ umode_t mode)
{
int ret;
- mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+ OCFS2_I(dir)->ip_blkno, mode);
ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
}
static int ocfs2_create(struct inode *dir,
struct dentry *dentry,
- int mode,
- struct nameidata *nd)
+ umode_t mode,
+ bool excl)
{
int ret;
- mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, mode);
ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
}
@@ -653,127 +676,140 @@ static int ocfs2_link(struct dentry *old_dentry,
struct inode *dir,
struct dentry *dentry)
{
- struct ocfs2_journal_handle *handle = NULL;
+ handle_t *handle;
struct inode *inode = old_dentry->d_inode;
int err;
struct buffer_head *fe_bh = NULL;
struct buffer_head *parent_fe_bh = NULL;
- struct buffer_head *de_bh = NULL;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct ocfs2_dir_lookup_result lookup = { NULL, };
+ sigset_t oldset;
+ u64 old_de_ino;
- mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
- old_dentry->d_name.len, old_dentry->d_name.name,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ old_dentry->d_name.len, old_dentry->d_name.name,
+ dentry->d_name.len, dentry->d_name.name);
- if (S_ISDIR(inode->i_mode)) {
- err = -EPERM;
- goto bail;
- }
+ if (S_ISDIR(inode->i_mode))
+ return -EPERM;
- handle = ocfs2_alloc_handle(osb);
- if (handle == NULL) {
- err = -ENOMEM;
- goto bail;
- }
+ dquot_initialize(dir);
- err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+ err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
if (err < 0) {
if (err != -ENOENT)
mlog_errno(err);
- goto bail;
+ return err;
}
if (!dir->i_nlink) {
err = -ENOENT;
- goto bail;
+ goto out;
+ }
+
+ err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
+ old_dentry->d_name.len, &old_de_ino);
+ if (err) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ /*
+ * Check whether another node removed the source inode while we
+ * were in the vfs.
+ */
+ if (old_de_ino != OCFS2_I(inode)->ip_blkno) {
+ err = -ENOENT;
+ goto out;
}
err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
dentry->d_name.len);
if (err)
- goto bail;
+ goto out;
err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
dentry->d_name.name,
- dentry->d_name.len, &de_bh);
+ dentry->d_name.len, &lookup);
if (err < 0) {
mlog_errno(err);
- goto bail;
+ goto out;
}
- err = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
+ err = ocfs2_inode_lock(inode, &fe_bh, 1);
if (err < 0) {
if (err != -ENOENT)
mlog_errno(err);
- goto bail;
+ goto out;
}
fe = (struct ocfs2_dinode *) fe_bh->b_data;
- if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
+ if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) {
err = -EMLINK;
- goto bail;
+ goto out_unlock_inode;
}
- handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS);
+ handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
handle = NULL;
mlog_errno(err);
- goto bail;
+ goto out_unlock_inode;
}
- err = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ /* Starting to change things, restart is no longer possible. */
+ ocfs2_block_signals(&oldset);
+
+ err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (err < 0) {
mlog_errno(err);
- goto bail;
+ goto out_commit;
}
- inode->i_nlink++;
+ inc_nlink(inode);
inode->i_ctime = CURRENT_TIME;
- fe->i_links_count = cpu_to_le16(inode->i_nlink);
+ ocfs2_set_links_count(fe, inode->i_nlink);
fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
-
- err = ocfs2_journal_dirty(handle, fe_bh);
- if (err < 0) {
- le16_add_cpu(&fe->i_links_count, -1);
- inode->i_nlink--;
- mlog_errno(err);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, fe_bh);
err = ocfs2_add_entry(handle, dentry, inode,
OCFS2_I(inode)->ip_blkno,
- parent_fe_bh, de_bh);
+ parent_fe_bh, &lookup);
if (err) {
- le16_add_cpu(&fe->i_links_count, -1);
- inode->i_nlink--;
+ ocfs2_add_links_count(fe, -1);
+ drop_nlink(inode);
mlog_errno(err);
- goto bail;
+ goto out_commit;
}
err = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
if (err) {
mlog_errno(err);
- goto bail;
+ goto out_commit;
}
- atomic_inc(&inode->i_count);
- dentry->d_op = &ocfs2_dentry_ops;
+ ihold(inode);
d_instantiate(dentry, inode);
-bail:
- if (handle)
- ocfs2_commit_trans(handle);
- if (de_bh)
- brelse(de_bh);
- if (fe_bh)
- brelse(fe_bh);
- if (parent_fe_bh)
- brelse(parent_fe_bh);
- mlog_exit(err);
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+ ocfs2_unblock_signals(&oldset);
+out_unlock_inode:
+ ocfs2_inode_unlock(inode, 1);
+
+out:
+ ocfs2_inode_unlock(dir, 1);
+
+ brelse(fe_bh);
+ brelse(parent_fe_bh);
+
+ ocfs2_free_dir_lookup_result(&lookup);
+
+ if (err)
+ mlog_errno(err);
return err;
}
@@ -795,53 +831,60 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry)
return ret;
}
+static inline int ocfs2_inode_is_unlinkable(struct inode *inode)
+{
+ if (S_ISDIR(inode->i_mode)) {
+ if (inode->i_nlink == 2)
+ return 1;
+ return 0;
+ }
+
+ if (inode->i_nlink == 1)
+ return 1;
+ return 0;
+}
+
static int ocfs2_unlink(struct inode *dir,
struct dentry *dentry)
{
int status;
- unsigned int saved_nlink = 0;
+ int child_locked = 0;
+ bool is_unlinkable = false;
struct inode *inode = dentry->d_inode;
+ struct inode *orphan_dir = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
u64 blkno;
struct ocfs2_dinode *fe = NULL;
struct buffer_head *fe_bh = NULL;
struct buffer_head *parent_node_bh = NULL;
- struct ocfs2_journal_handle *handle = NULL;
- struct ocfs2_dir_entry *dirent = NULL;
- struct buffer_head *dirent_bh = NULL;
+ handle_t *handle = NULL;
char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
- struct buffer_head *orphan_entry_bh = NULL;
-
- mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
- dentry->d_name.len, dentry->d_name.name);
+ struct ocfs2_dir_lookup_result lookup = { NULL, };
+ struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
- BUG_ON(dentry->d_parent->d_inode != dir);
+ trace_ocfs2_unlink(dir, dentry, dentry->d_name.len,
+ dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
- mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ dquot_initialize(dir);
- if (inode == osb->root_inode) {
- mlog(0, "Cannot delete the root directory\n");
- status = -EPERM;
- goto leave;
- }
+ BUG_ON(dentry->d_parent->d_inode != dir);
- handle = ocfs2_alloc_handle(osb);
- if (handle == NULL) {
- status = -ENOMEM;
- mlog_errno(status);
- goto leave;
- }
+ if (inode == osb->root_inode)
+ return -EPERM;
- status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1);
+ status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1,
+ OI_LS_PARENT);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
- goto leave;
+ return status;
}
status = ocfs2_find_files_on_disk(dentry->d_name.name,
- dentry->d_name.len, &blkno,
- dir, &dirent_bh, &dirent);
+ dentry->d_name.len, &blkno, dir,
+ &lookup);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -851,58 +894,48 @@ static int ocfs2_unlink(struct inode *dir,
if (OCFS2_I(inode)->ip_blkno != blkno) {
status = -ENOENT;
- mlog(0, "ip_blkno %llu != dirent blkno %llu ip_flags = %x\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)blkno, OCFS2_I(inode)->ip_flags);
+ trace_ocfs2_unlink_noent(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)blkno,
+ OCFS2_I(inode)->ip_flags);
goto leave;
}
- status = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
+ status = ocfs2_inode_lock(inode, &fe_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
goto leave;
}
+ child_locked = 1;
if (S_ISDIR(inode->i_mode)) {
- if (!ocfs2_empty_dir(inode)) {
- status = -ENOTEMPTY;
- goto leave;
- } else if (inode->i_nlink != 2) {
+ if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) {
status = -ENOTEMPTY;
goto leave;
}
}
- /* There are still a few steps left until we can consider the
- * unlink to have succeeded. Save off nlink here before
- * modification so we can set it back in case we hit an issue
- * before commit. */
- saved_nlink = inode->i_nlink;
- if (S_ISDIR(inode->i_mode))
- inode->i_nlink = 0;
- else
- inode->i_nlink--;
-
status = ocfs2_remote_dentry_delete(dentry);
if (status < 0) {
- /* This vote should succeed under all normal
+ /* This remote delete should succeed under all normal
* circumstances. */
mlog_errno(status);
goto leave;
}
- if (!inode->i_nlink) {
- status = ocfs2_prepare_orphan_dir(osb, handle, inode,
- orphan_name,
- &orphan_entry_bh);
+ if (ocfs2_inode_is_unlinkable(inode)) {
+ status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
+ OCFS2_I(inode)->ip_blkno,
+ orphan_name, &orphan_insert);
if (status < 0) {
mlog_errno(status);
goto leave;
}
+ is_unlinkable = true;
}
- handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS);
+ handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -910,8 +943,8 @@ static int ocfs2_unlink(struct inode *dir,
goto leave;
}
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -919,100 +952,171 @@ static int ocfs2_unlink(struct inode *dir,
fe = (struct ocfs2_dinode *) fe_bh->b_data;
- if (!inode->i_nlink) {
- status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
- orphan_entry_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
- }
-
/* delete the name from the parent dir */
- status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
+ status = ocfs2_delete_entry(handle, dir, &lookup);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- /* We can set nlink on the dinode now. clear the saved version
- * so that it doesn't get set later. */
- fe->i_links_count = cpu_to_le16(inode->i_nlink);
- saved_nlink = 0;
+ if (S_ISDIR(inode->i_mode))
+ drop_nlink(inode);
+ drop_nlink(inode);
+ ocfs2_set_links_count(fe, inode->i_nlink);
+ ocfs2_journal_dirty(handle, fe_bh);
+
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ if (S_ISDIR(inode->i_mode))
+ drop_nlink(dir);
- status = ocfs2_journal_dirty(handle, fe_bh);
+ status = ocfs2_mark_inode_dirty(handle, dir, parent_node_bh);
if (status < 0) {
mlog_errno(status);
+ if (S_ISDIR(inode->i_mode))
+ inc_nlink(dir);
goto leave;
}
- if (S_ISDIR(inode->i_mode)) {
- dir->i_nlink--;
- status = ocfs2_mark_inode_dirty(handle, dir,
- parent_node_bh);
- if (status < 0) {
+ if (is_unlinkable) {
+ status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
+ orphan_name, &orphan_insert, orphan_dir);
+ if (status < 0)
mlog_errno(status);
- dir->i_nlink++;
- }
}
leave:
- if (status < 0 && saved_nlink)
- inode->i_nlink = saved_nlink;
-
if (handle)
- ocfs2_commit_trans(handle);
+ ocfs2_commit_trans(osb, handle);
+
+ if (child_locked)
+ ocfs2_inode_unlock(inode, 1);
- if (fe_bh)
- brelse(fe_bh);
+ ocfs2_inode_unlock(dir, 1);
- if (dirent_bh)
- brelse(dirent_bh);
+ if (orphan_dir) {
+ /* This was locked for us in ocfs2_prepare_orphan_dir() */
+ ocfs2_inode_unlock(orphan_dir, 1);
+ mutex_unlock(&orphan_dir->i_mutex);
+ iput(orphan_dir);
+ }
- if (parent_node_bh)
- brelse(parent_node_bh);
+ brelse(fe_bh);
+ brelse(parent_node_bh);
- if (orphan_entry_bh)
- brelse(orphan_entry_bh);
+ ocfs2_free_dir_lookup_result(&orphan_insert);
+ ocfs2_free_dir_lookup_result(&lookup);
- mlog_exit(status);
+ if (status && (status != -ENOTEMPTY) && (status != -ENOENT))
+ mlog_errno(status);
return status;
}
+static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
+ u64 src_inode_no, u64 dest_inode_no)
+{
+ int ret = 0, i = 0;
+ u64 parent_inode_no = 0;
+ u64 child_inode_no = src_inode_no;
+ struct inode *child_inode;
+
+#define MAX_LOOKUP_TIMES 32
+ while (1) {
+ child_inode = ocfs2_iget(osb, child_inode_no, 0, 0);
+ if (IS_ERR(child_inode)) {
+ ret = PTR_ERR(child_inode);
+ break;
+ }
+
+ ret = ocfs2_inode_lock(child_inode, NULL, 0);
+ if (ret < 0) {
+ iput(child_inode);
+ if (ret != -ENOENT)
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2,
+ &parent_inode_no);
+ ocfs2_inode_unlock(child_inode, 0);
+ iput(child_inode);
+ if (ret < 0) {
+ ret = -ENOENT;
+ break;
+ }
+
+ if (parent_inode_no == dest_inode_no) {
+ ret = 1;
+ break;
+ }
+
+ if (parent_inode_no == osb->root_inode->i_ino) {
+ ret = 0;
+ break;
+ }
+
+ child_inode_no = parent_inode_no;
+
+ if (++i >= MAX_LOOKUP_TIMES) {
+ mlog(ML_NOTICE, "max lookup times reached, filesystem "
+ "may have nested directories, "
+ "src inode: %llu, dest inode: %llu.\n",
+ (unsigned long long)src_inode_no,
+ (unsigned long long)dest_inode_no);
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
/*
* The only place this should be used is rename!
* if they have the same id, then the 1st one is the only one locked.
*/
static int ocfs2_double_lock(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
struct buffer_head **bh1,
struct inode *inode1,
struct buffer_head **bh2,
struct inode *inode2)
{
int status;
+ int inode1_is_ancestor, inode2_is_ancestor;
struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
struct buffer_head **tmpbh;
struct inode *tmpinode;
- mlog_entry("(inode1 = %llu, inode2 = %llu)\n",
- (unsigned long long)oi1->ip_blkno,
- (unsigned long long)oi2->ip_blkno);
-
- BUG_ON(!handle);
+ trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
if (*bh1)
*bh1 = NULL;
if (*bh2)
*bh2 = NULL;
- /* we always want to lock the one with the lower lockid first. */
+ /* we always want to lock the one with the lower lockid first.
+ * and if they are nested, we lock ancestor first */
if (oi1->ip_blkno != oi2->ip_blkno) {
- if (oi1->ip_blkno < oi2->ip_blkno) {
+ inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
+ oi1->ip_blkno);
+ if (inode1_is_ancestor < 0) {
+ status = inode1_is_ancestor;
+ goto bail;
+ }
+
+ inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
+ oi2->ip_blkno);
+ if (inode2_is_ancestor < 0) {
+ status = inode2_is_ancestor;
+ goto bail;
+ }
+
+ if ((inode1_is_ancestor == 1) ||
+ (oi1->ip_blkno < oi2->ip_blkno &&
+ inode2_is_ancestor == 0)) {
/* switch id1 and id2 around */
- mlog(0, "switching them around...\n");
tmpbh = bh2;
bh2 = bh1;
bh1 = tmpbh;
@@ -1022,62 +1126,87 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
inode1 = tmpinode;
}
/* lock id2 */
- status = ocfs2_meta_lock(inode2, handle, bh2, 1);
+ status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+ OI_LS_RENAME1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
goto bail;
}
}
+
/* lock id1 */
- status = ocfs2_meta_lock(inode1, handle, bh1, 1);
+ status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
if (status < 0) {
+ /*
+ * An error return must mean that no cluster locks
+ * were held on function exit.
+ */
+ if (oi1->ip_blkno != oi2->ip_blkno) {
+ ocfs2_inode_unlock(inode2, 1);
+ brelse(*bh2);
+ *bh2 = NULL;
+ }
+
if (status != -ENOENT)
mlog_errno(status);
- goto bail;
}
+
+ trace_ocfs2_double_lock_end(
+ (unsigned long long)OCFS2_I(inode1)->ip_blkno,
+ (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-#define PARENT_INO(buffer) \
- ((struct ocfs2_dir_entry *) \
- ((char *)buffer + \
- le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode
+static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
+{
+ ocfs2_inode_unlock(inode1, 1);
+
+ if (inode1 != inode2)
+ ocfs2_inode_unlock(inode2, 1);
+}
static int ocfs2_rename(struct inode *old_dir,
struct dentry *old_dentry,
struct inode *new_dir,
struct dentry *new_dentry)
{
- int status = 0, rename_lock = 0;
+ int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
+ int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
struct inode *old_inode = old_dentry->d_inode;
struct inode *new_inode = new_dentry->d_inode;
+ struct inode *orphan_dir = NULL;
struct ocfs2_dinode *newfe = NULL;
char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
- struct buffer_head *orphan_entry_bh = NULL;
struct buffer_head *newfe_bh = NULL;
- struct buffer_head *insert_entry_bh = NULL;
+ struct buffer_head *old_inode_bh = NULL;
struct ocfs2_super *osb = NULL;
- u64 newfe_blkno;
- struct ocfs2_journal_handle *handle = NULL;
+ u64 newfe_blkno, old_de_ino;
+ handle_t *handle = NULL;
struct buffer_head *old_dir_bh = NULL;
struct buffer_head *new_dir_bh = NULL;
- struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
- // and new_dentry
- struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
- struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
- // this is the 1st dirent bh
- nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink;
+ u32 old_dir_nlink = old_dir->i_nlink;
+ struct ocfs2_dinode *old_di;
+ struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
+ struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
+ struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
+ struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+ struct ocfs2_dir_lookup_result target_insert = { NULL, };
+ bool should_add_orphan = false;
/* At some point it might be nice to break this function up a
* bit. */
- mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
- old_dir, old_dentry, new_dir, new_dentry,
- old_dentry->d_name.len, old_dentry->d_name.name,
- new_dentry->d_name.len, new_dentry->d_name.name);
+ trace_ocfs2_rename(old_dir, old_dentry, new_dir, new_dentry,
+ old_dentry->d_name.len, old_dentry->d_name.name,
+ new_dentry->d_name.len, new_dentry->d_name.name);
+
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
osb = OCFS2_SB(old_dir->i_sb);
@@ -1086,15 +1215,7 @@ static int ocfs2_rename(struct inode *old_dir,
BUG();
}
- if (atomic_read(&old_dentry->d_count) > 2) {
- shrink_dcache_parent(old_dentry);
- if (atomic_read(&old_dentry->d_count) > 2) {
- status = -EBUSY;
- goto bail;
- }
- }
-
- /* Assume a directory heirarchy thusly:
+ /* Assume a directory hierarchy thusly:
* a/b/c
* a/d
* a,b,c, and d are all directories.
@@ -1105,30 +1226,38 @@ static int ocfs2_rename(struct inode *old_dir,
*
* And that's why, just like the VFS, we need a file system
* rename lock. */
- if (old_dentry != new_dentry) {
+ if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) {
status = ocfs2_rename_lock(osb);
if (status < 0) {
mlog_errno(status);
goto bail;
}
rename_lock = 1;
- }
- handle = ocfs2_alloc_handle(osb);
- if (handle == NULL) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
+ /* here we cannot guarantee the inodes haven't just been
+ * changed, so check if they are nested again */
+ status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
+ old_inode->i_ino);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ } else if (status == 1) {
+ status = -EPERM;
+ trace_ocfs2_rename_not_permitted(
+ (unsigned long long)old_inode->i_ino,
+ (unsigned long long)new_dir->i_ino);
+ goto bail;
+ }
}
/* if old and new are the same, this'll just do one lock. */
- status = ocfs2_double_lock(osb, handle,
- &old_dir_bh, old_dir,
- &new_dir_bh, new_dir);
+ status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
+ &new_dir_bh, new_dir);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+ parents_locked = 1;
/* make sure both dirs have bhs
* get an extra ref on old_dir_bh if old==new */
@@ -1144,17 +1273,19 @@ static int ocfs2_rename(struct inode *old_dir,
}
/*
- * Though we don't require an inode meta data update if
- * old_inode is not a directory, we lock anyway here to ensure
- * the vote thread on other nodes won't have to concurrently
- * downconvert the inode and the dentry locks.
+ * Aside from allowing a meta data update, the locking here
+ * also ensures that the downconvert thread on other nodes
+ * won't have to concurrently downconvert the inode and the
+ * dentry locks.
*/
- status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
+ status = ocfs2_inode_lock_nested(old_inode, &old_inode_bh, 1,
+ OI_LS_PARENT);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
goto bail;
}
+ old_child_locked = 1;
status = ocfs2_remote_dentry_delete(old_dentry);
if (status < 0) {
@@ -1163,27 +1294,36 @@ static int ocfs2_rename(struct inode *old_dir,
}
if (S_ISDIR(old_inode->i_mode)) {
- status = -EIO;
- old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
- if (!old_inode_de_bh)
+ u64 old_inode_parent;
+
+ update_dot_dot = 1;
+ status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
+ old_inode,
+ &old_inode_dot_dot_res);
+ if (status) {
+ status = -EIO;
goto bail;
+ }
- status = -EIO;
- if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
- OCFS2_I(old_dir)->ip_blkno)
+ if (old_inode_parent != OCFS2_I(old_dir)->ip_blkno) {
+ status = -EIO;
goto bail;
- status = -EMLINK;
- if (!new_inode && new_dir!=old_dir &&
- new_dir->i_nlink >= OCFS2_LINK_MAX)
+ }
+
+ if (!new_inode && new_dir != old_dir &&
+ new_dir->i_nlink >= ocfs2_link_max(osb)) {
+ status = -EMLINK;
goto bail;
+ }
}
- status = -ENOENT;
- old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
- old_dentry->d_name.len,
- old_dir, &old_de);
- if (!old_de_bh)
+ status = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
+ old_dentry->d_name.len,
+ &old_de_ino);
+ if (status) {
+ status = -ENOENT;
goto bail;
+ }
/*
* Check for inode number is _not_ due to possible IO errors.
@@ -1191,15 +1331,17 @@ static int ocfs2_rename(struct inode *old_dir,
* and merrily kill the link to whatever was created under the
* same name. Goodbye sticky bit ;-<
*/
- if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
+ if (old_de_ino != OCFS2_I(old_inode)->ip_blkno) {
+ status = -ENOENT;
goto bail;
+ }
/* check if the target already exists (in which case we need
* to delete it */
status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
new_dentry->d_name.len,
- &newfe_blkno, new_dir, &new_de_bh,
- &new_de);
+ &newfe_blkno, new_dir,
+ &target_lookup_res);
/* The only error we allow here is -ENOENT because the new
* file not existing is perfectly valid. */
if ((status < 0) && (status != -ENOENT)) {
@@ -1208,14 +1350,23 @@ static int ocfs2_rename(struct inode *old_dir,
mlog_errno(status);
goto bail;
}
+ if (status == 0)
+ target_exists = 1;
- if (!new_de && new_inode)
- mlog(ML_ERROR, "inode %lu does not exist in it's parent "
- "directory!", new_inode->i_ino);
+ if (!target_exists && new_inode) {
+ /*
+ * Target was unlinked by another node while we were
+ * waiting to get to ocfs2_rename(). There isn't
+ * anything we can do here to help the situation, so
+ * bubble up the appropriate error.
+ */
+ status = -ENOENT;
+ goto bail;
+ }
/* In case we need to overwrite an existing file, we blow it
* away first */
- if (new_de) {
+ if (target_exists) {
/* VFS didn't think there existed an inode here, but
* someone else in the cluster must have raced our
* rename to create one. Today we error cleanly, in
@@ -1224,28 +1375,28 @@ static int ocfs2_rename(struct inode *old_dir,
if (!new_inode) {
status = -EACCES;
- mlog(0, "We found an inode for name %.*s but VFS "
- "didn't give us one.\n", new_dentry->d_name.len,
- new_dentry->d_name.name);
+ trace_ocfs2_rename_target_exists(new_dentry->d_name.len,
+ new_dentry->d_name.name);
goto bail;
}
if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
status = -EACCES;
- mlog(0, "Inode %llu and dir %llu disagree. flags = %x\n",
+ trace_ocfs2_rename_disagree(
(unsigned long long)OCFS2_I(new_inode)->ip_blkno,
(unsigned long long)newfe_blkno,
OCFS2_I(new_inode)->ip_flags);
goto bail;
}
- status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1);
+ status = ocfs2_inode_lock(new_inode, &newfe_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
goto bail;
}
+ new_child_locked = 1;
status = ocfs2_remote_dentry_delete(new_dentry);
if (status < 0) {
@@ -1255,20 +1406,19 @@ static int ocfs2_rename(struct inode *old_dir,
newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
- mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu "
- "newfebh=%p bhblocknr=%llu\n", new_de,
+ trace_ocfs2_rename_over_existing(
(unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
(unsigned long long)newfe_bh->b_blocknr : 0ULL);
if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
- status = ocfs2_prepare_orphan_dir(osb, handle,
- new_inode,
- orphan_name,
- &orphan_entry_bh);
+ status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
+ OCFS2_I(new_inode)->ip_blkno,
+ orphan_name, &orphan_insert);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+ should_add_orphan = true;
}
} else {
BUG_ON(new_dentry->d_parent->d_inode != new_dir);
@@ -1282,14 +1432,14 @@ static int ocfs2_rename(struct inode *old_dir,
status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
new_dentry->d_name.name,
new_dentry->d_name.len,
- &insert_entry_bh);
+ &target_insert);
if (status < 0) {
mlog_errno(status);
goto bail;
}
}
- handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS);
+ handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -1297,117 +1447,120 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
- if (new_de) {
+ if (target_exists) {
if (S_ISDIR(new_inode->i_mode)) {
- if (!ocfs2_empty_dir(new_inode) ||
- new_inode->i_nlink != 2) {
+ if (new_inode->i_nlink != 2 ||
+ !ocfs2_empty_dir(new_inode)) {
status = -ENOTEMPTY;
goto bail;
}
}
- status = ocfs2_journal_access(handle, new_inode, newfe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(new_inode),
+ newfe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- if (S_ISDIR(new_inode->i_mode) ||
- (newfe->i_links_count == cpu_to_le16(1))){
- status = ocfs2_orphan_add(osb, handle, new_inode,
- newfe, orphan_name,
- orphan_entry_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
-
/* change the dirent to point to the correct inode */
- status = ocfs2_journal_access(handle, new_dir, new_de_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
+ old_inode);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
- new_de->file_type = old_de->file_type;
new_dir->i_version++;
- status = ocfs2_journal_dirty(handle, new_de_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
if (S_ISDIR(new_inode->i_mode))
- newfe->i_links_count = 0;
+ ocfs2_set_links_count(newfe, 0);
else
- le16_add_cpu(&newfe->i_links_count, -1);
-
- status = ocfs2_journal_dirty(handle, newfe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ ocfs2_add_links_count(newfe, -1);
+ ocfs2_journal_dirty(handle, newfe_bh);
+ if (should_add_orphan) {
+ status = ocfs2_orphan_add(osb, handle, new_inode,
+ newfe_bh, orphan_name,
+ &orphan_insert, orphan_dir);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
}
} else {
/* if the name was not found in new_dir, add it now */
status = ocfs2_add_entry(handle, new_dentry, old_inode,
OCFS2_I(old_inode)->ip_blkno,
- new_dir_bh, insert_entry_bh);
+ new_dir_bh, &target_insert);
}
old_inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(old_inode);
- /* now that the name has been added to new_dir, remove the old name */
- status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode),
+ old_inode_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status >= 0) {
+ old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
+
+ old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
+ old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
+ ocfs2_journal_dirty(handle, old_inode_bh);
+ } else
+ mlog_errno(status);
+
+ /*
+ * Now that the name has been added to new_dir, remove the old name.
+ *
+ * We don't keep any directory entry context around until now
+ * because the insert might have changed the type of directory
+ * we're dealing with.
+ */
+ status = ocfs2_find_entry(old_dentry->d_name.name,
+ old_dentry->d_name.len, old_dir,
+ &old_entry_lookup);
+ if (status)
+ goto bail;
+
+ status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
if (status < 0) {
mlog_errno(status);
goto bail;
}
if (new_inode) {
- new_inode->i_nlink--;
+ drop_nlink(new_inode);
new_inode->i_ctime = CURRENT_TIME;
}
old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
- if (old_inode_de_bh) {
- status = ocfs2_journal_access(handle, old_inode,
- old_inode_de_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- PARENT_INO(old_inode_de_bh->b_data) =
- cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
- status = ocfs2_journal_dirty(handle, old_inode_de_bh);
- old_dir->i_nlink--;
+
+ if (update_dot_dot) {
+ status = ocfs2_update_entry(old_inode, handle,
+ &old_inode_dot_dot_res, new_dir);
+ drop_nlink(old_dir);
if (new_inode) {
- new_inode->i_nlink--;
+ drop_nlink(new_inode);
} else {
- new_dir->i_nlink++;
+ inc_nlink(new_dir);
mark_inode_dirty(new_dir);
}
}
mark_inode_dirty(old_dir);
- if (new_inode)
+ ocfs2_mark_inode_dirty(handle, old_dir, old_dir_bh);
+ if (new_inode) {
mark_inode_dirty(new_inode);
+ ocfs2_mark_inode_dirty(handle, new_inode, newfe_bh);
+ }
- if (old_dir != new_dir)
- if (new_dir_nlink != new_dir->i_nlink) {
- if (!new_dir_bh) {
- mlog(ML_ERROR, "need to change nlink for new "
- "dir %llu from %d to %d but bh is NULL\n",
- (unsigned long long)OCFS2_I(new_dir)->ip_blkno,
- (int)new_dir_nlink, new_dir->i_nlink);
- } else {
- struct ocfs2_dinode *fe;
- status = ocfs2_journal_access(handle,
- new_dir,
- new_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- fe = (struct ocfs2_dinode *) new_dir_bh->b_data;
- fe->i_links_count = cpu_to_le16(new_dir->i_nlink);
- status = ocfs2_journal_dirty(handle, new_dir_bh);
- }
- }
+ if (old_dir != new_dir) {
+ /* Keep the same times on both directories.*/
+ new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime;
+
+ /*
+ * This will also pick up the i_nlink change from the
+ * block above.
+ */
+ ocfs2_mark_inode_dirty(handle, new_dir, new_dir_bh);
+ }
if (old_dir_nlink != old_dir->i_nlink) {
if (!old_dir_bh) {
@@ -1417,15 +1570,15 @@ static int ocfs2_rename(struct inode *old_dir,
(int)old_dir_nlink, old_dir->i_nlink);
} else {
struct ocfs2_dinode *fe;
- status = ocfs2_journal_access(handle, old_dir,
- old_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(old_dir),
+ old_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
- fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
- status = ocfs2_journal_dirty(handle, old_dir_bh);
+ ocfs2_set_links_count(fe, old_dir->i_nlink);
+ ocfs2_journal_dirty(handle, old_dir_bh);
}
}
-
ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
status = 0;
bail:
@@ -1433,31 +1586,43 @@ bail:
ocfs2_rename_unlock(osb);
if (handle)
- ocfs2_commit_trans(handle);
+ ocfs2_commit_trans(osb, handle);
+
+ if (parents_locked)
+ ocfs2_double_unlock(old_dir, new_dir);
+
+ if (old_child_locked)
+ ocfs2_inode_unlock(old_inode, 1);
+
+ if (new_child_locked)
+ ocfs2_inode_unlock(new_inode, 1);
+
+ if (orphan_dir) {
+ /* This was locked for us in ocfs2_prepare_orphan_dir() */
+ ocfs2_inode_unlock(orphan_dir, 1);
+ mutex_unlock(&orphan_dir->i_mutex);
+ iput(orphan_dir);
+ }
if (new_inode)
sync_mapping_buffers(old_inode->i_mapping);
if (new_inode)
iput(new_inode);
- if (newfe_bh)
- brelse(newfe_bh);
- if (old_dir_bh)
- brelse(old_dir_bh);
- if (new_dir_bh)
- brelse(new_dir_bh);
- if (new_de_bh)
- brelse(new_de_bh);
- if (old_de_bh)
- brelse(old_de_bh);
- if (old_inode_de_bh)
- brelse(old_inode_de_bh);
- if (orphan_entry_bh)
- brelse(orphan_entry_bh);
- if (insert_entry_bh)
- brelse(insert_entry_bh);
-
- mlog_exit(status);
+
+ ocfs2_free_dir_lookup_result(&target_lookup_res);
+ ocfs2_free_dir_lookup_result(&old_entry_lookup);
+ ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res);
+ ocfs2_free_dir_lookup_result(&orphan_insert);
+ ocfs2_free_dir_lookup_result(&target_insert);
+
+ brelse(newfe_bh);
+ brelse(old_inode_bh);
+ brelse(old_dir_bh);
+ brelse(new_dir_bh);
+
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1467,15 +1632,14 @@ bail:
* data, including the null terminator.
*/
static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+ handle_t *handle,
struct inode *inode,
const char *symname)
{
struct buffer_head **bhs = NULL;
const char *c;
struct super_block *sb = osb->sb;
- u64 p_blkno;
- int p_blocks;
+ u64 p_blkno, p_blocks;
int virtual, blocks, status, i, bytes_left;
bytes_left = i_size_read(inode) + 1;
@@ -1483,9 +1647,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
* write i_size + 1 bytes. */
blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
- mlog_entry("i_blocks = %llu, i_size = %llu, blocks = %d\n",
- (unsigned long long)inode->i_blocks,
- i_size_read(inode), blocks);
+ trace_ocfs2_create_symlink_data((unsigned long long)inode->i_blocks,
+ i_size_read(inode), blocks);
/* Sanity check -- make sure we're going to fit. */
if (bytes_left >
@@ -1502,8 +1665,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
- &p_blocks);
+ status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
+ NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1528,9 +1691,11 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
- ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode),
+ bhs[virtual]);
- status = ocfs2_journal_access(handle, inode, bhs[virtual],
+ status = ocfs2_journal_access(handle, INODE_CACHE(inode),
+ bhs[virtual],
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -1543,11 +1708,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
(bytes_left > sb->s_blocksize) ? sb->s_blocksize :
bytes_left);
- status = ocfs2_journal_dirty(handle, bhs[virtual]);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, bhs[virtual]);
virtual++;
p_blkno++;
@@ -1559,12 +1720,12 @@ bail:
if (bhs) {
for(i = 0; i < blocks; i++)
- if (bhs[i])
- brelse(bhs[i]);
+ brelse(bhs[i]);
kfree(bhs);
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1578,16 +1739,28 @@ static int ocfs2_symlink(struct inode *dir,
struct inode *inode = NULL;
struct super_block *sb;
struct buffer_head *new_fe_bh = NULL;
- struct buffer_head *de_bh = NULL;
struct buffer_head *parent_fe_bh = NULL;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_dinode *dirfe;
- struct ocfs2_journal_handle *handle = NULL;
+ handle_t *handle = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
struct ocfs2_alloc_context *data_ac = NULL;
-
- mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
- dentry, symname, dentry->d_name.len, dentry->d_name.name);
+ struct ocfs2_alloc_context *xattr_ac = NULL;
+ int want_clusters = 0;
+ int xattr_credits = 0;
+ struct ocfs2_security_xattr_info si = {
+ .enable = 1,
+ };
+ int did_quota = 0, did_quota_inode = 0;
+ struct ocfs2_dir_lookup_result lookup = { NULL, };
+ sigset_t oldset;
+ int did_block_signals = 0;
+ struct ocfs2_dentry_lock *dl = NULL;
+
+ trace_ocfs2_symlink_begin(dir, dentry, symname,
+ dentry->d_name.len, dentry->d_name.name);
+
+ dquot_initialize(dir);
sb = dir->i_sb;
osb = OCFS2_SB(sb);
@@ -1596,23 +1769,16 @@ static int ocfs2_symlink(struct inode *dir,
credits = ocfs2_calc_symlink_credits(sb);
- handle = ocfs2_alloc_handle(osb);
- if (handle == NULL) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
-
/* lock the parent directory */
- status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+ status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
- goto bail;
+ return status;
}
dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
- if (!dirfe->i_links_count) {
+ if (!ocfs2_read_links_count(dirfe)) {
/* can't make a file in a deleted directory. */
status = -ENOENT;
goto bail;
@@ -1625,30 +1791,59 @@ static int ocfs2_symlink(struct inode *dir,
status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
dentry->d_name.name,
- dentry->d_name.len, &de_bh);
+ dentry->d_name.len, &lookup);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
+ status = ocfs2_reserve_new_inode(osb, &inode_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto bail;
}
- /* don't reserve bitmap space for fast symlinks. */
- if (l > ocfs2_fast_symlink_chars(sb)) {
- status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+ inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
+ if (!inode) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* get security xattr */
+ status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
+ if (status) {
+ if (status == -EOPNOTSUPP)
+ si.enable = 0;
+ else {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ /* calculate meta data/clusters for setting security xattr */
+ if (si.enable) {
+ status = ocfs2_calc_security_init(dir, &si, &want_clusters,
+ &xattr_credits, &xattr_ac);
if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
+ mlog_errno(status);
goto bail;
}
}
- handle = ocfs2_start_trans(osb, handle, credits);
+ /* don't reserve bitmap space for fast symlinks. */
+ if (l > ocfs2_fast_symlink_chars(sb))
+ want_clusters += 1;
+
+ status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+
+ handle = ocfs2_start_trans(osb, credits + xattr_credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -1656,10 +1851,23 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
- status = ocfs2_mknod_locked(osb, dir, dentry,
- S_IFLNK | S_IRWXUGO, 0,
- &new_fe_bh, parent_fe_bh, handle,
- &inode, inode_ac);
+ /* Starting to change things, restart is no longer possible. */
+ ocfs2_block_signals(&oldset);
+ did_block_signals = 1;
+
+ status = dquot_alloc_inode(inode);
+ if (status)
+ goto bail;
+ did_quota_inode = 1;
+
+ trace_ocfs2_symlink_create(dir, dentry, dentry->d_name.len,
+ dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ inode->i_mode);
+
+ status = ocfs2_mknod_locked(osb, dir, inode,
+ 0, &new_fe_bh, parent_fe_bh, handle,
+ inode_ac);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1668,11 +1876,20 @@ static int ocfs2_symlink(struct inode *dir,
fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
inode->i_rdev = 0;
newsize = l - 1;
+ inode->i_op = &ocfs2_symlink_inode_operations;
if (l > ocfs2_fast_symlink_chars(sb)) {
- inode->i_op = &ocfs2_symlink_inode_operations;
- status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
- handle, data_ac, NULL,
- NULL);
+ u32 offset = 0;
+
+ status = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (status)
+ goto bail;
+ did_quota = 1;
+ inode->i_mapping->a_ops = &ocfs2_aops;
+ status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
+ new_fe_bh,
+ handle, data_ac, NULL,
+ NULL);
if (status < 0) {
if (status != -ENOSPC && status != -EINTR) {
mlog(ML_ERROR,
@@ -1684,9 +1901,9 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
i_size_write(inode, newsize);
- inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
} else {
- inode->i_op = &ocfs2_fast_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
memcpy((char *) fe->id2.i_symlink, symname, l);
i_size_write(inode, newsize);
inode->i_blocks = 0;
@@ -1707,401 +1924,319 @@ static int ocfs2_symlink(struct inode *dir,
}
}
- status = ocfs2_add_entry(handle, dentry, inode,
- le64_to_cpu(fe->i_blkno), parent_fe_bh,
- de_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ if (si.enable) {
+ status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+ xattr_ac, data_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
}
+ /*
+ * Do this before adding the entry to the directory. We add
+ * also set d_op after success so that ->d_iput() will cleanup
+ * the dentry lock even if ocfs2_add_entry() fails below.
+ */
status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
if (status) {
mlog_errno(status);
goto bail;
}
+ dl = dentry->d_fsdata;
+
+ status = ocfs2_add_entry(handle, dentry, inode,
+ le64_to_cpu(fe->i_blkno), parent_fe_bh,
+ &lookup);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
insert_inode_hash(inode);
- dentry->d_op = &ocfs2_dentry_ops;
d_instantiate(dentry, inode);
bail:
+ if (status < 0 && did_quota)
+ dquot_free_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (status < 0 && did_quota_inode)
+ dquot_free_inode(inode);
if (handle)
- ocfs2_commit_trans(handle);
- if (new_fe_bh)
- brelse(new_fe_bh);
- if (parent_fe_bh)
- brelse(parent_fe_bh);
- if (de_bh)
- brelse(de_bh);
+ ocfs2_commit_trans(osb, handle);
+
+ ocfs2_inode_unlock(dir, 1);
+ if (did_block_signals)
+ ocfs2_unblock_signals(&oldset);
+
+ brelse(new_fe_bh);
+ brelse(parent_fe_bh);
+ kfree(si.value);
+ ocfs2_free_dir_lookup_result(&lookup);
if (inode_ac)
ocfs2_free_alloc_context(inode_ac);
if (data_ac)
ocfs2_free_alloc_context(data_ac);
- if ((status < 0) && inode)
+ if (xattr_ac)
+ ocfs2_free_alloc_context(xattr_ac);
+ if ((status < 0) && inode) {
+ if (dl)
+ ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
+ OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
+ clear_nlink(inode);
iput(inode);
+ }
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-int ocfs2_check_dir_entry(struct inode * dir,
- struct ocfs2_dir_entry * de,
- struct buffer_head * bh,
- unsigned long offset)
+static int ocfs2_blkno_stringify(u64 blkno, char *name)
{
- const char *error_msg = NULL;
- const int rlen = le16_to_cpu(de->rec_len);
-
- if (rlen < OCFS2_DIR_REC_LEN(1))
- error_msg = "rec_len is smaller than minimal";
- else if (rlen % 4 != 0)
- error_msg = "rec_len % 4 != 0";
- else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
- error_msg = "rec_len is too small for name_len";
- else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
- error_msg = "directory entry across blocks";
-
- if (error_msg != NULL)
- mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
- "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
- offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
- de->name_len);
- return error_msg == NULL ? 1 : 0;
+ int status, namelen;
+
+ namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx",
+ (long long)blkno);
+ if (namelen <= 0) {
+ if (namelen)
+ status = namelen;
+ else
+ status = -EINVAL;
+ mlog_errno(status);
+ goto bail;
+ }
+ if (namelen != OCFS2_ORPHAN_NAMELEN) {
+ status = -EINVAL;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ trace_ocfs2_blkno_stringify(blkno, name, namelen);
+
+ status = 0;
+bail:
+ if (status < 0)
+ mlog_errno(status);
+ return status;
}
-/* we don't always have a dentry for what we want to add, so people
- * like orphan dir can call this instead.
- *
- * If you pass me insert_bh, I'll skip the search of the other dir
- * blocks and put the record in there.
- */
-static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
- struct inode *dir,
- const char *name, int namelen,
- struct inode *inode, u64 blkno,
- struct buffer_head *parent_fe_bh,
- struct buffer_head *insert_bh)
+static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
+ struct inode **ret_orphan_dir,
+ struct buffer_head **ret_orphan_dir_bh)
{
- unsigned long offset;
- unsigned short rec_len;
- struct ocfs2_dir_entry *de, *de1;
- struct super_block *sb;
- int retval, status;
+ struct inode *orphan_dir_inode;
+ struct buffer_head *orphan_dir_bh = NULL;
+ int ret = 0;
- mlog_entry_void();
+ orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+ ORPHAN_DIR_SYSTEM_INODE,
+ osb->slot_num);
+ if (!orphan_dir_inode) {
+ ret = -ENOENT;
+ mlog_errno(ret);
+ return ret;
+ }
- sb = dir->i_sb;
+ mutex_lock(&orphan_dir_inode->i_mutex);
- if (!namelen)
- return -EINVAL;
+ ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ if (ret < 0) {
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
- rec_len = OCFS2_DIR_REC_LEN(namelen);
- offset = 0;
- de = (struct ocfs2_dir_entry *) insert_bh->b_data;
- while (1) {
- BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
- /* These checks should've already been passed by the
- * prepare function, but I guess we can leave them
- * here anyway. */
- if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
- retval = -ENOENT;
- goto bail;
- }
- if (ocfs2_match(namelen, name, de)) {
- retval = -EEXIST;
- goto bail;
- }
- if (((le64_to_cpu(de->inode) == 0) &&
- (le16_to_cpu(de->rec_len) >= rec_len)) ||
- (le16_to_cpu(de->rec_len) >=
- (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
- status = ocfs2_journal_access(handle, dir, insert_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- /* By now the buffer is marked for journaling */
- offset += le16_to_cpu(de->rec_len);
- if (le64_to_cpu(de->inode)) {
- de1 = (struct ocfs2_dir_entry *)((char *) de +
- OCFS2_DIR_REC_LEN(de->name_len));
- de1->rec_len =
- cpu_to_le16(le16_to_cpu(de->rec_len) -
- OCFS2_DIR_REC_LEN(de->name_len));
- de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
- de = de1;
- }
- de->file_type = OCFS2_FT_UNKNOWN;
- if (blkno) {
- de->inode = cpu_to_le64(blkno);
- ocfs2_set_de_type(de, inode->i_mode);
- } else
- de->inode = 0;
- de->name_len = namelen;
- memcpy(de->name, name, namelen);
-
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
- dir->i_version++;
- status = ocfs2_journal_dirty(handle, insert_bh);
- retval = 0;
- goto bail;
- }
- offset += le16_to_cpu(de->rec_len);
- de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
+ mlog_errno(ret);
+ return ret;
}
- /* when you think about it, the assert above should prevent us
- * from ever getting here. */
- retval = -ENOSPC;
-bail:
+ *ret_orphan_dir = orphan_dir_inode;
+ *ret_orphan_dir_bh = orphan_dir_bh;
- mlog_exit(retval);
- return retval;
+ return 0;
}
-
-/*
- * ocfs2_delete_entry deletes a directory entry by merging it with the
- * previous entry
- */
-static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
- struct inode *dir,
- struct ocfs2_dir_entry *de_del,
- struct buffer_head *bh)
+static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
+ struct buffer_head *orphan_dir_bh,
+ u64 blkno,
+ char *name,
+ struct ocfs2_dir_lookup_result *lookup)
{
- struct ocfs2_dir_entry *de, *pde;
- int i, status = -ENOENT;
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
- mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
+ ret = ocfs2_blkno_stringify(blkno, name);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
- i = 0;
- pde = NULL;
- de = (struct ocfs2_dir_entry *) bh->b_data;
- while (i < bh->b_size) {
- if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
- status = -EIO;
- mlog_errno(status);
- goto bail;
- }
- if (de == de_del) {
- status = ocfs2_journal_access(handle, dir, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- status = -EIO;
- mlog_errno(status);
- goto bail;
- }
- if (pde)
- pde->rec_len =
- cpu_to_le16(le16_to_cpu(pde->rec_len) +
- le16_to_cpu(de->rec_len));
- else
- de->inode = 0;
- dir->i_version++;
- status = ocfs2_journal_dirty(handle, bh);
- goto bail;
- }
- i += le16_to_cpu(de->rec_len);
- pde = de;
- de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
+ ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
+ orphan_dir_bh, name,
+ OCFS2_ORPHAN_NAMELEN, lookup);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
}
-bail:
- mlog_exit(status);
- return status;
+
+ return 0;
}
-/*
- * Returns 0 if not found, -1 on failure, and 1 on success
+/**
+ * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for
+ * insertion of an orphan.
+ * @osb: ocfs2 file system
+ * @ret_orphan_dir: Orphan dir inode - returned locked!
+ * @blkno: Actual block number of the inode to be inserted into orphan dir.
+ * @lookup: dir lookup result, to be passed back into functions like
+ * ocfs2_orphan_add
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure.
*/
-static int inline ocfs2_search_dirblock(struct buffer_head *bh,
- struct inode *dir,
- const char *name, int namelen,
- unsigned long offset,
- struct ocfs2_dir_entry **res_dir)
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+ struct inode **ret_orphan_dir,
+ u64 blkno,
+ char *name,
+ struct ocfs2_dir_lookup_result *lookup)
{
- struct ocfs2_dir_entry *de;
- char *dlimit, *de_buf;
- int de_len;
+ struct inode *orphan_dir_inode = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
int ret = 0;
- mlog_entry_void();
-
- de_buf = bh->b_data;
- dlimit = de_buf + dir->i_sb->s_blocksize;
-
- while (de_buf < dlimit) {
- /* this code is executed quadratically often */
- /* do minimal checking `by hand' */
+ ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode,
+ &orphan_dir_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
- de = (struct ocfs2_dir_entry *) de_buf;
+ ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
+ blkno, name, lookup);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
- if (de_buf + namelen <= dlimit &&
- ocfs2_match(namelen, name, de)) {
- /* found a match - just to be sure, do a full check */
- if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
- ret = -1;
- goto bail;
- }
- *res_dir = de;
- ret = 1;
- goto bail;
- }
+ *ret_orphan_dir = orphan_dir_inode;
- /* prevent looping on a bad block */
- de_len = le16_to_cpu(de->rec_len);
- if (de_len <= 0) {
- ret = -1;
- goto bail;
- }
+out:
+ brelse(orphan_dir_bh);
- de_buf += de_len;
- offset += de_len;
+ if (ret) {
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
}
-bail:
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
}
-struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
- struct inode *dir,
- struct ocfs2_dir_entry **res_dir)
+static int ocfs2_orphan_add(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ char *name,
+ struct ocfs2_dir_lookup_result *lookup,
+ struct inode *orphan_dir_inode)
{
- struct super_block *sb;
- struct buffer_head *bh_use[NAMEI_RA_SIZE];
- struct buffer_head *bh, *ret = NULL;
- unsigned long start, block, b;
- int ra_max = 0; /* Number of bh's in the readahead
- buffer, bh_use[] */
- int ra_ptr = 0; /* Current index into readahead
- buffer */
- int num = 0;
- int nblocks, i, err;
-
- mlog_entry_void();
-
- *res_dir = NULL;
- sb = dir->i_sb;
+ struct buffer_head *orphan_dir_bh = NULL;
+ int status = 0;
+ struct ocfs2_dinode *orphan_fe;
+ struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
- nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
- start = OCFS2_I(dir)->ip_dir_start_lookup;
- if (start >= nblocks)
- start = 0;
- block = start;
+ trace_ocfs2_orphan_add_begin(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
-restart:
- do {
- /*
- * We deal with the read-ahead logic here.
- */
- if (ra_ptr >= ra_max) {
- /* Refill the readahead buffer */
- ra_ptr = 0;
- b = block;
- for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
- /*
- * Terminate if we reach the end of the
- * directory and must wrap, or if our
- * search has finished at this block.
- */
- if (b >= nblocks || (num && block == start)) {
- bh_use[ra_max] = NULL;
- break;
- }
- num++;
-
- bh = ocfs2_bread(dir, b++, &err, 1);
- bh_use[ra_max] = bh;
- }
- }
- if ((bh = bh_use[ra_ptr++]) == NULL)
- goto next;
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- /* read error, skip block & hope for the best */
- ocfs2_error(dir->i_sb, "reading directory %llu, "
- "offset %lu\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- block);
- brelse(bh);
- goto next;
- }
- i = ocfs2_search_dirblock(bh, dir, name, namelen,
- block << sb->s_blocksize_bits,
- res_dir);
- if (i == 1) {
- OCFS2_I(dir)->ip_dir_start_lookup = block;
- ret = bh;
- goto cleanup_and_exit;
- } else {
- brelse(bh);
- if (i < 0)
- goto cleanup_and_exit;
- }
- next:
- if (++block >= nblocks)
- block = 0;
- } while (block != start);
+ status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(orphan_dir_inode),
+ orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
/*
- * If the directory has grown while we were searching, then
- * search the last part of the directory before giving up.
+ * We're going to journal the change of i_flags and i_orphaned_slot.
+ * It's safe anyway, though some callers may duplicate the journaling.
+ * Journaling within the func just make the logic look more
+ * straightforward.
*/
- block = nblocks;
- nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
- if (block < nblocks) {
- start = 0;
- goto restart;
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(inode),
+ fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
}
-cleanup_and_exit:
- /* Clean up the read-ahead blocks */
- for (; ra_ptr < ra_max; ra_ptr++)
- brelse(bh_use[ra_ptr]);
+ /* we're a cluster, and nlink can change on disk from
+ * underneath us... */
+ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
+ if (S_ISDIR(inode->i_mode))
+ ocfs2_add_links_count(orphan_fe, 1);
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+ ocfs2_journal_dirty(handle, orphan_dir_bh);
- mlog_exit_ptr(ret);
- return ret;
-}
+ status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
+ OCFS2_ORPHAN_NAMELEN, inode,
+ OCFS2_I(inode)->ip_blkno,
+ orphan_dir_bh, lookup);
+ if (status < 0) {
+ mlog_errno(status);
+ goto rollback;
+ }
-static int ocfs2_blkno_stringify(u64 blkno, char *name)
-{
- int status, namelen;
+ fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
+ OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
- mlog_entry_void();
+ /* Record which orphan dir our inode now resides
+ * in. delete_inode will use this to determine which orphan
+ * dir to lock. */
+ fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
- namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx",
- (long long)blkno);
- if (namelen <= 0) {
- if (namelen)
- status = namelen;
- else
- status = -EINVAL;
- mlog_errno(status);
- goto bail;
- }
- if (namelen != OCFS2_ORPHAN_NAMELEN) {
- status = -EINVAL;
- mlog_errno(status);
- goto bail;
+ ocfs2_journal_dirty(handle, fe_bh);
+
+ trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ osb->slot_num);
+
+rollback:
+ if (status < 0) {
+ if (S_ISDIR(inode->i_mode))
+ ocfs2_add_links_count(orphan_fe, -1);
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
}
- mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
- namelen);
+leave:
+ brelse(orphan_dir_bh);
- status = 0;
-bail:
- mlog_exit(status);
return status;
}
-static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *inode,
- char *name,
- struct buffer_head **de_bh)
+/* unlike orphan_add, we expect the orphan dir to already be locked here. */
+int ocfs2_orphan_del(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct inode *orphan_dir_inode,
+ struct inode *inode,
+ struct buffer_head *orphan_dir_bh)
{
- struct inode *orphan_dir_inode = NULL;
- struct buffer_head *orphan_dir_bh = NULL;
+ char name[OCFS2_ORPHAN_NAMELEN + 1];
+ struct ocfs2_dinode *orphan_fe;
int status = 0;
+ struct ocfs2_dir_lookup_result lookup = { NULL, };
status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
if (status < 0) {
@@ -2109,195 +2244,404 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
goto leave;
}
- orphan_dir_inode = ocfs2_get_system_file_inode(osb,
- ORPHAN_DIR_SYSTEM_INODE,
- osb->slot_num);
- if (!orphan_dir_inode) {
- status = -ENOENT;
+ trace_ocfs2_orphan_del(
+ (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
+ name, OCFS2_ORPHAN_NAMELEN);
+
+ /* find it's spot in the orphan directory */
+ status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
+ &lookup);
+ if (status) {
mlog_errno(status);
goto leave;
}
- ocfs2_handle_add_inode(handle, orphan_dir_inode);
- status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
+ /* remove it from the orphan directory */
+ status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
- orphan_dir_bh, name,
- OCFS2_ORPHAN_NAMELEN, de_bh);
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(orphan_dir_inode),
+ orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
}
-leave:
- if (orphan_dir_inode)
- iput(orphan_dir_inode);
+ /* do the i_nlink dance! :) */
+ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
+ if (S_ISDIR(inode->i_mode))
+ ocfs2_add_links_count(orphan_fe, -1);
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+ ocfs2_journal_dirty(handle, orphan_dir_bh);
- if (orphan_dir_bh)
- brelse(orphan_dir_bh);
+leave:
+ ocfs2_free_dir_lookup_result(&lookup);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-static int ocfs2_orphan_add(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *inode,
- struct ocfs2_dinode *fe,
- char *name,
- struct buffer_head *de_bh)
+/**
+ * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to receive a newly
+ * allocated file. This is different from the typical 'add to orphan dir'
+ * operation in that the inode does not yet exist. This is a problem because
+ * the orphan dir stringifies the inode block number to come up with it's
+ * dirent. Obviously if the inode does not yet exist we have a chicken and egg
+ * problem. This function works around it by calling deeper into the orphan
+ * and suballoc code than other callers. Use this only by necessity.
+ * @dir: The directory which this inode will ultimately wind up under - not the
+ * orphan dir!
+ * @dir_bh: buffer_head the @dir inode block
+ * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled
+ * with the string to be used for orphan dirent. Pass back to the orphan dir
+ * code.
+ * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan
+ * dir code.
+ * @ret_di_blkno: block number where the new inode will be allocated.
+ * @orphan_insert: Dir insert context to be passed back into orphan dir code.
+ * @ret_inode_ac: Inode alloc context to be passed back to the allocator.
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure.
+ */
+static int ocfs2_prep_new_orphaned_file(struct inode *dir,
+ struct buffer_head *dir_bh,
+ char *orphan_name,
+ struct inode **ret_orphan_dir,
+ u64 *ret_di_blkno,
+ struct ocfs2_dir_lookup_result *orphan_insert,
+ struct ocfs2_alloc_context **ret_inode_ac)
{
- struct inode *orphan_dir_inode = NULL;
+ int ret;
+ u64 di_blkno;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct inode *orphan_dir = NULL;
struct buffer_head *orphan_dir_bh = NULL;
- int status = 0;
- struct ocfs2_dinode *orphan_fe;
+ struct ocfs2_alloc_context *inode_ac = NULL;
- mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+ ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
- orphan_dir_inode = ocfs2_get_system_file_inode(osb,
- ORPHAN_DIR_SYSTEM_INODE,
- osb->slot_num);
- if (!orphan_dir_inode) {
- status = -ENOENT;
+ /* reserve an inode spot */
+ ret = ocfs2_reserve_new_inode(osb, &inode_ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac,
+ &di_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
+ di_blkno, orphan_name, orphan_insert);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ret == 0) {
+ *ret_orphan_dir = orphan_dir;
+ *ret_di_blkno = di_blkno;
+ *ret_inode_ac = inode_ac;
+ /*
+ * orphan_name and orphan_insert are already up to
+ * date via prepare_orphan_dir
+ */
+ } else {
+ /* Unroll reserve_new_inode* */
+ if (inode_ac)
+ ocfs2_free_alloc_context(inode_ac);
+
+ /* Unroll orphan dir locking */
+ mutex_unlock(&orphan_dir->i_mutex);
+ ocfs2_inode_unlock(orphan_dir, 1);
+ iput(orphan_dir);
+ }
+
+ brelse(orphan_dir_bh);
+
+ return ret;
+}
+
+int ocfs2_create_inode_in_orphan(struct inode *dir,
+ int mode,
+ struct inode **new_inode)
+{
+ int status, did_quota_inode = 0;
+ struct inode *inode = NULL;
+ struct inode *orphan_dir = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct ocfs2_dinode *di = NULL;
+ handle_t *handle = NULL;
+ char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+ struct buffer_head *parent_di_bh = NULL;
+ struct buffer_head *new_di_bh = NULL;
+ struct ocfs2_alloc_context *inode_ac = NULL;
+ struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+ u64 uninitialized_var(di_blkno), suballoc_loc;
+ u16 suballoc_bit;
+
+ status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+
+ status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh,
+ orphan_name, &orphan_dir,
+ &di_blkno, &orphan_insert, &inode_ac);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto leave;
+ }
+
+ inode = ocfs2_get_init_inode(dir, mode);
+ if (!inode) {
+ status = -ENOMEM;
mlog_errno(status);
goto leave;
}
- status = ocfs2_read_block(osb,
- OCFS2_I(orphan_dir_inode)->ip_blkno,
- &orphan_dir_bh, OCFS2_BH_CACHED,
- orphan_dir_inode);
- if (status < 0) {
+ handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ handle = NULL;
mlog_errno(status);
goto leave;
}
- status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = dquot_alloc_inode(inode);
+ if (status)
+ goto leave;
+ did_quota_inode = 1;
+
+ status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
+ &suballoc_loc,
+ &suballoc_bit, di_blkno);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- /* we're a cluster, and nlink can change on disk from
- * underneath us... */
- orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
- if (S_ISDIR(inode->i_mode))
- le16_add_cpu(&orphan_fe->i_links_count, 1);
- orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
-
- status = ocfs2_journal_dirty(handle, orphan_dir_bh);
+ clear_nlink(inode);
+ /* do the real work now. */
+ status = __ocfs2_mknod_locked(dir, inode,
+ 0, &new_di_bh, parent_di_bh, handle,
+ inode_ac, di_blkno, suballoc_loc,
+ suballoc_bit);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
- OCFS2_ORPHAN_NAMELEN, inode,
- OCFS2_I(inode)->ip_blkno,
- orphan_dir_bh, de_bh);
+ di = (struct ocfs2_dinode *)new_di_bh->b_data;
+ status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
+ &orphan_insert, orphan_dir);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
+ /* get open lock so that only nodes can't remove it from orphan dir. */
+ status = ocfs2_open_lock(inode);
+ if (status < 0)
+ mlog_errno(status);
- /* Record which orphan dir our inode now resides
- * in. delete_inode will use this to determine which orphan
- * dir to lock. */
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
+ insert_inode_hash(inode);
+leave:
+ if (status < 0 && did_quota_inode)
+ dquot_free_inode(inode);
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
- mlog(0, "Inode %llu orphaned in slot %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
+ if (orphan_dir) {
+ /* This was locked for us in ocfs2_prepare_orphan_dir() */
+ ocfs2_inode_unlock(orphan_dir, 1);
+ mutex_unlock(&orphan_dir->i_mutex);
+ iput(orphan_dir);
+ }
-leave:
- if (orphan_dir_inode)
- iput(orphan_dir_inode);
+ if ((status < 0) && inode) {
+ clear_nlink(inode);
+ iput(inode);
+ }
- if (orphan_dir_bh)
- brelse(orphan_dir_bh);
+ if (inode_ac)
+ ocfs2_free_alloc_context(inode_ac);
- mlog_exit(status);
+ brelse(new_di_bh);
+
+ if (!status)
+ *new_inode = inode;
+
+ ocfs2_free_dir_lookup_result(&orphan_insert);
+
+ ocfs2_inode_unlock(dir, 1);
+ brelse(parent_di_bh);
return status;
}
-/* unlike orphan_add, we expect the orphan dir to already be locked here. */
-int ocfs2_orphan_del(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct inode *orphan_dir_inode,
- struct inode *inode,
- struct buffer_head *orphan_dir_bh)
+int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+ struct inode *inode,
+ struct dentry *dentry)
{
- char name[OCFS2_ORPHAN_NAMELEN + 1];
- struct ocfs2_dinode *orphan_fe;
int status = 0;
- struct buffer_head *target_de_bh = NULL;
- struct ocfs2_dir_entry *target_de = NULL;
+ struct buffer_head *parent_di_bh = NULL;
+ handle_t *handle = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct ocfs2_dinode *dir_di, *di;
+ struct inode *orphan_dir_inode = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dir_lookup_result lookup = { NULL, };
- mlog_entry_void();
+ trace_ocfs2_mv_orphaned_inode_to_new(dir, dentry,
+ dentry->d_name.len, dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+ status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
if (status < 0) {
- mlog_errno(status);
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+
+ dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
+ if (!dir_di->i_links_count) {
+ /* can't make a file in a deleted directory. */
+ status = -ENOENT;
goto leave;
}
- mlog(0, "removing '%s' from orphan dir %llu (namelen=%d)\n",
- name, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
- OCFS2_ORPHAN_NAMELEN);
+ status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+ dentry->d_name.len);
+ if (status)
+ goto leave;
+
+ /* get a spot inside the dir. */
+ status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
+ dentry->d_name.name,
+ dentry->d_name.len, &lookup);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
- /* find it's spot in the orphan directory */
- target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
- orphan_dir_inode, &target_de);
- if (!target_de_bh) {
- status = -ENOENT;
+ orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+ ORPHAN_DIR_SYSTEM_INODE,
+ osb->slot_num);
+ if (!orphan_dir_inode) {
+ status = -EEXIST;
mlog_errno(status);
goto leave;
}
- /* remove it from the orphan directory */
- status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
- target_de_bh);
+ mutex_lock(&orphan_dir_inode->i_mutex);
+
+ status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (status < 0) {
mlog_errno(status);
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
goto leave;
}
- status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_read_inode_block(inode, &di_bh);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto orphan_unlock;
}
- /* do the i_nlink dance! :) */
- orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
- if (S_ISDIR(inode->i_mode))
- le16_add_cpu(&orphan_fe->i_links_count, -1);
- orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+ handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(status);
+ goto orphan_unlock;
+ }
- status = ocfs2_journal_dirty(handle, orphan_dir_bh);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_commit;
}
+ status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+ orphan_dir_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);
+ di->i_orphaned_slot = 0;
+ set_nlink(inode, 1);
+ ocfs2_set_links_count(di, inode->i_nlink);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ ocfs2_journal_dirty(handle, di_bh);
+
+ status = ocfs2_add_entry(handle, dentry, inode,
+ OCFS2_I(inode)->ip_blkno, parent_di_bh,
+ &lookup);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ status = ocfs2_dentry_attach_lock(dentry, inode,
+ OCFS2_I(dir)->ip_blkno);
+ if (status) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ d_instantiate(dentry, inode);
+ status = 0;
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+orphan_unlock:
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
leave:
- if (target_de_bh)
- brelse(target_de_bh);
- mlog_exit(status);
+ ocfs2_inode_unlock(dir, 1);
+
+ brelse(di_bh);
+ brelse(parent_di_bh);
+ brelse(orphan_dir_bh);
+
+ ocfs2_free_dir_lookup_result(&lookup);
+
+ if (status)
+ mlog_errno(status);
+
return status;
}
-struct inode_operations ocfs2_dir_iops = {
+const struct inode_operations ocfs2_dir_iops = {
.create = ocfs2_create,
.lookup = ocfs2_lookup,
.link = ocfs2_link,
@@ -2309,4 +2653,12 @@ struct inode_operations ocfs2_dir_iops = {
.rename = ocfs2_rename,
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
+ .permission = ocfs2_permission,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ocfs2_listxattr,
+ .removexattr = generic_removexattr,
+ .fiemap = ocfs2_fiemap,
+ .get_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
};
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index deaaa97dbf0..e5d059d4f11 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -26,33 +26,20 @@
#ifndef OCFS2_NAMEI_H
#define OCFS2_NAMEI_H
-extern struct inode_operations ocfs2_dir_iops;
+extern const struct inode_operations ocfs2_dir_iops;
struct dentry *ocfs2_get_parent(struct dentry *child);
-int ocfs2_check_dir_entry (struct inode *dir,
- struct ocfs2_dir_entry *de,
- struct buffer_head *bh,
- unsigned long offset);
-struct buffer_head *ocfs2_find_entry(const char *name,
- int namelen,
- struct inode *dir,
- struct ocfs2_dir_entry **res_dir);
int ocfs2_orphan_del(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+ handle_t *handle,
struct inode *orphan_dir_inode,
struct inode *inode,
struct buffer_head *orphan_dir_bh);
-
-static inline int ocfs2_match(int len,
- const char * const name,
- struct ocfs2_dir_entry *de)
-{
- if (len != de->name_len)
- return 0;
- if (!de->inode)
- return 0;
- return !memcmp(name, de->name, len);
-}
+int ocfs2_create_inode_in_orphan(struct inode *dir,
+ int mode,
+ struct inode **new_inode);
+int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+ struct inode *new_inode,
+ struct dentry *new_dentry);
#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h
index 0b499bccec5..dfb313bda5d 100644
--- a/fs/ocfs2/ocfs1_fs_compat.h
+++ b/fs/ocfs2/ocfs1_fs_compat.h
@@ -77,7 +77,7 @@ struct ocfs1_disk_lock
{
/*00*/ __u32 curr_master;
__u8 file_lock;
- __u8 compat_pad[3]; /* Not in orignal definition. Used to
+ __u8 compat_pad[3]; /* Not in original definition. Used to
make the already existing alignment
explicit */
__u64 last_write_time;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 0462a7f4e21..bbec539230f 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -30,40 +30,70 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/list.h>
+#include <linux/llist.h>
#include <linux/rbtree.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/mutex.h>
+#include <linux/lockdep.h>
+#include <linux/jbd2.h>
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
-#include "dlm/dlmapi.h"
+/* For union ocfs2_dlm_lksb */
+#include "stackglue.h"
#include "ocfs2_fs.h"
-#include "endian.h"
#include "ocfs2_lockid.h"
+#include "ocfs2_ioctl.h"
-struct ocfs2_extent_map {
- u32 em_clusters;
- struct rb_root em_extents;
-};
+/* For struct ocfs2_blockcheck_stats */
+#include "blockcheck.h"
+
+#include "reservations.h"
+
+/* Caching of metadata buffers */
/* Most user visible OCFS2 inodes will have very few pieces of
* metadata, but larger files (including bitmaps, etc) must be taken
* into account when designing an access scheme. We allow a small
* amount of inlined blocks to be stored on an array and grow the
* structure into a rb tree when necessary. */
-#define OCFS2_INODE_MAX_CACHE_ARRAY 2
+#define OCFS2_CACHE_INFO_MAX_ARRAY 2
+
+/* Flags for ocfs2_caching_info */
+enum ocfs2_caching_info_flags {
+ /* Indicates that the metadata cache is using the inline array */
+ OCFS2_CACHE_FL_INLINE = 1<<1,
+};
+
+struct ocfs2_caching_operations;
struct ocfs2_caching_info {
+ /*
+ * The parent structure provides the locks, but because the
+ * parent structure can differ, it provides locking operations
+ * to struct ocfs2_caching_info.
+ */
+ const struct ocfs2_caching_operations *ci_ops;
+
+ /* next two are protected by trans_inc_lock */
+ /* which transaction were we created on? Zero if none. */
+ unsigned long ci_created_trans;
+ /* last transaction we were a part of. */
+ unsigned long ci_last_trans;
+
+ /* Cache structures */
+ unsigned int ci_flags;
unsigned int ci_num_cached;
union {
- sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
+ sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY];
struct rb_root ci_tree;
} ci_cache;
};
+/*
+ * Need this prototype here instead of in uptodate.h because journal.h
+ * uses it.
+ */
+struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci);
/* this limits us to 256 nodes
* if we need more, we can do a kmalloc for the map */
@@ -88,7 +118,7 @@ enum ocfs2_unlock_action {
};
/* ocfs2_lock_res->l_flags flags. */
-#define OCFS2_LOCK_ATTACHED (0x00000001) /* have we initialized
+#define OCFS2_LOCK_ATTACHED (0x00000001) /* we have initialized
* the lvb */
#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in
* dlm_lock */
@@ -105,36 +135,88 @@ enum ocfs2_unlock_action {
* about to be
* dropped. */
#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
+#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */
+#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a
+ call to dlm_lock. Only
+ exists with BUSY set. */
+#define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread
+ * from downconverting
+ * before the upconvert
+ * has completed */
struct ocfs2_lock_res_ops;
typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
+#ifdef CONFIG_OCFS2_FS_STATS
+struct ocfs2_lock_stats {
+ u64 ls_total; /* Total wait in NSEC */
+ u32 ls_gets; /* Num acquires */
+ u32 ls_fail; /* Num failed acquires */
+
+ /* Storing max wait in usecs saves 24 bytes per inode */
+ u32 ls_max; /* Max wait in USEC */
+};
+#endif
+
struct ocfs2_lock_res {
void *l_priv;
struct ocfs2_lock_res_ops *l_ops;
- spinlock_t l_lock;
+
struct list_head l_blocked_list;
struct list_head l_mask_waiters;
- enum ocfs2_lock_type l_type;
unsigned long l_flags;
char l_name[OCFS2_LOCK_ID_MAX_LEN];
- int l_level;
unsigned int l_ro_holders;
unsigned int l_ex_holders;
- struct dlm_lockstatus l_lksb;
+ signed char l_level;
+ signed char l_requested;
+ signed char l_blocking;
+
+ /* Data packed - type enum ocfs2_lock_type */
+ unsigned char l_type;
/* used from AST/BAST funcs. */
- enum ocfs2_ast_action l_action;
- enum ocfs2_unlock_action l_unlock_action;
- int l_requested;
- int l_blocking;
+ /* Data packed - enum type ocfs2_ast_action */
+ unsigned char l_action;
+ /* Data packed - enum type ocfs2_unlock_action */
+ unsigned char l_unlock_action;
+ unsigned int l_pending_gen;
+
+ spinlock_t l_lock;
+
+ struct ocfs2_dlm_lksb l_lksb;
wait_queue_head_t l_event;
struct list_head l_debug_list;
+
+#ifdef CONFIG_OCFS2_FS_STATS
+ struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */
+ u32 l_lock_refresh; /* Disk refreshes */
+ struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map l_lockdep_map;
+#endif
+};
+
+enum ocfs2_orphan_scan_state {
+ ORPHAN_SCAN_ACTIVE,
+ ORPHAN_SCAN_INACTIVE
+};
+
+struct ocfs2_orphan_scan {
+ struct mutex os_lock;
+ struct ocfs2_super *os_osb;
+ struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
+ struct delayed_work os_orphan_scan_work;
+ struct timespec os_scantime; /* time this node ran the scan */
+ u32 os_count; /* tracks node specific scans */
+ u32 os_seqno; /* tracks cluster wide scans */
+ atomic_t os_state; /* ACTIVE or INACTIVE */
};
struct ocfs2_dlm_debug {
@@ -147,6 +229,7 @@ enum ocfs2_vol_state
{
VOLUME_INIT = 0,
VOLUME_MOUNTED,
+ VOLUME_MOUNTED_QUOTAS,
VOLUME_DISMOUNTED,
VOLUME_DISABLED
};
@@ -162,40 +245,60 @@ struct ocfs2_alloc_stats
enum ocfs2_local_alloc_state
{
- OCFS2_LA_UNUSED = 0,
- OCFS2_LA_ENABLED,
- OCFS2_LA_DISABLED
+ OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for
+ * this mountpoint. */
+ OCFS2_LA_ENABLED, /* Local alloc is in use. */
+ OCFS2_LA_THROTTLED, /* Local alloc is in use, but number
+ * of bits has been reduced. */
+ OCFS2_LA_DISABLED /* Local alloc has temporarily been
+ * disabled. */
};
enum ocfs2_mount_options
{
- OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */
+ OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
+ OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
+ OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
+ OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
+ OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */
+ OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access
+ control lists */
+ OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
+ OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
+ OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
+ writes */
+ OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
+ OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
};
#define OCFS2_OSB_SOFT_RO 0x0001
#define OCFS2_OSB_HARD_RO 0x0002
#define OCFS2_OSB_ERROR_FS 0x0004
+#define OCFS2_DEFAULT_ATIME_QUANTUM 60
struct ocfs2_journal;
-struct ocfs2_journal_handle;
+struct ocfs2_slot_info;
+struct ocfs2_recovery_map;
+struct ocfs2_replay_map;
+struct ocfs2_quota_recovery;
struct ocfs2_super
{
struct task_struct *commit_task;
struct super_block *sb;
struct inode *root_inode;
struct inode *sys_root_inode;
- struct inode *system_inodes[NUM_SYSTEM_INODES];
+ struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
+ struct inode **local_system_inodes;
struct ocfs2_slot_info *slot_info;
+ u32 *slot_recovery_generations;
+
spinlock_t node_map_lock;
- struct ocfs2_node_map mounted_map;
- struct ocfs2_node_map recovery_map;
- struct ocfs2_node_map umount_map;
u64 root_blkno;
u64 system_dir_blkno;
@@ -203,6 +306,7 @@ struct ocfs2_super
u32 bitmap_cpg;
u8 *uuid;
char *uuid_str;
+ u32 uuid_hash;
u8 *vol_label;
u64 first_cluster_group_blkno;
u32 fs_generation;
@@ -211,87 +315,149 @@ struct ocfs2_super
u32 s_feature_incompat;
u32 s_feature_ro_compat;
- /* Protects s_next_generaion, osb_flags. Could protect more on
- * osb as it's very short lived. */
+ /* Protects s_next_generation, osb_flags and s_inode_steal_slot.
+ * Could protect more on osb as it's very short lived.
+ */
spinlock_t osb_lock;
u32 s_next_generation;
unsigned long osb_flags;
+ s16 s_inode_steal_slot;
+ s16 s_meta_steal_slot;
+ atomic_t s_num_inodes_stolen;
+ atomic_t s_num_meta_stolen;
unsigned long s_mount_opt;
+ unsigned int s_atime_quantum;
- u16 max_slots;
- s16 node_num;
- s16 slot_num;
+ unsigned int max_slots;
+ unsigned int node_num;
+ int slot_num;
+ int preferred_slot;
int s_sectsize_bits;
int s_clustersize;
int s_clustersize_bits;
+ unsigned int s_xattr_inline_size;
atomic_t vol_state;
struct mutex recovery_lock;
+ struct ocfs2_recovery_map *recovery_map;
+ struct ocfs2_replay_map *replay_map;
struct task_struct *recovery_thread_task;
int disable_recovery;
wait_queue_head_t checkpoint_event;
- atomic_t needs_checkpoint;
struct ocfs2_journal *journal;
+ unsigned long osb_commit_interval;
+
+ struct delayed_work la_enable_wq;
+
+ /*
+ * Must hold local alloc i_mutex and osb->osb_lock to change
+ * local_alloc_bits. Reads can be done under either lock.
+ */
+ unsigned int local_alloc_bits;
+ unsigned int local_alloc_default_bits;
+ /* osb_clusters_at_boot can become stale! Do not trust it to
+ * be up to date. */
+ unsigned int osb_clusters_at_boot;
+
+ enum ocfs2_local_alloc_state local_alloc_state; /* protected
+ * by osb_lock */
- enum ocfs2_local_alloc_state local_alloc_state;
struct buffer_head *local_alloc_bh;
+
u64 la_last_gd;
- /* Next two fields are for local node slot recovery during
+ struct ocfs2_reservation_map osb_la_resmap;
+
+ unsigned int osb_resv_level;
+ unsigned int osb_dir_resv_level;
+
+ /* Next three fields are for local node slot recovery during
* mount. */
int dirty;
struct ocfs2_dinode *local_alloc_copy;
+ struct ocfs2_quota_recovery *quota_rec;
+ struct ocfs2_blockcheck_stats osb_ecc_stats;
struct ocfs2_alloc_stats alloc_stats;
char dev_str[20]; /* "major,minor" of the device */
- struct dlm_ctxt *dlm;
+ u8 osb_stackflags;
+
+ char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+ char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];
+ struct ocfs2_cluster_connection *cconn;
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
- struct dlm_eviction_cb osb_eviction_cb;
+ struct ocfs2_lock_res osb_nfs_sync_lockres;
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
+ struct dentry *osb_ctxt;
wait_queue_head_t recovery_event;
- spinlock_t vote_task_lock;
- struct task_struct *vote_task;
- wait_queue_head_t vote_event;
- unsigned long vote_wake_sequence;
- unsigned long vote_work_sequence;
-
+ spinlock_t dc_task_lock;
+ struct task_struct *dc_task;
+ wait_queue_head_t dc_event;
+ unsigned long dc_wake_sequence;
+ unsigned long dc_work_sequence;
+
+ /*
+ * Any thread can add locks to the list, but the downconvert
+ * thread is the only one allowed to remove locks. Any change
+ * to this rule requires updating
+ * ocfs2_downconvert_thread_do_work().
+ */
struct list_head blocked_lock_list;
unsigned long blocked_lock_count;
- struct list_head vote_list;
- int vote_count;
-
- u32 net_key;
- spinlock_t net_response_lock;
- unsigned int net_response_ids;
- struct list_head net_response_list;
-
- struct o2hb_callback_func osb_hb_up;
- struct o2hb_callback_func osb_hb_down;
-
- struct list_head osb_net_handlers;
+ /* List of dquot structures to drop last reference to */
+ struct llist_head dquot_drop_list;
+ struct work_struct dquot_drop_work;
wait_queue_head_t osb_mount_event;
/* Truncate log info */
struct inode *osb_tl_inode;
struct buffer_head *osb_tl_bh;
- struct work_struct osb_truncate_log_wq;
+ struct delayed_work osb_truncate_log_wq;
+ atomic_t osb_tl_disable;
+ /*
+ * How many clusters in our truncate log.
+ * It must be protected by osb_tl_inode->i_mutex.
+ */
+ unsigned int truncated_clusters;
struct ocfs2_node_map osb_recovering_orphan_dirs;
unsigned int *osb_orphan_wipes;
wait_queue_head_t osb_wipe_event;
+
+ struct ocfs2_orphan_scan osb_orphan_scan;
+
+ /* used to protect metaecc calculation check of xattr. */
+ spinlock_t osb_xattr_lock;
+
+ unsigned int osb_dx_mask;
+ u32 osb_dx_seed[4];
+
+ /* the group we used to allocate inodes. */
+ u64 osb_inode_alloc_group;
+
+ /* rb tree root for refcount lock. */
+ struct rb_root osb_rf_lock_tree;
+ struct ocfs2_refcount_tree *osb_ref_tree_lru;
+
+ struct mutex system_file_mutex;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
+/* Useful typedef for passing around journal access functions */
+typedef int (*ocfs2_journal_access_func)(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type);
+
static inline int ocfs2_should_order_data(struct inode *inode)
{
if (!S_ISREG(inode->i_mode))
@@ -301,6 +467,106 @@ static inline int ocfs2_should_order_data(struct inode *inode)
return 1;
}
+static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
+{
+ /*
+ * Support for sparse files is a pre-requisite
+ */
+ if (!ocfs2_sparse_alloc(osb))
+ return 0;
+
+ if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC)
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
+ return 1;
+ return 0;
+}
+
+static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
+{
+ if (ocfs2_supports_indexed_dirs(osb))
+ return OCFS2_DX_LINK_MAX;
+ return OCFS2_LINK_MAX;
+}
+
+static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
+{
+ u32 nlink = le16_to_cpu(di->i_links_count);
+ u32 hi = le16_to_cpu(di->i_links_count_hi);
+
+ if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
+ nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
+
+ return nlink;
+}
+
+static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink)
+{
+ u16 lo, hi;
+
+ lo = nlink;
+ hi = nlink >> OCFS2_LINKS_HI_SHIFT;
+
+ di->i_links_count = cpu_to_le16(lo);
+ di->i_links_count_hi = cpu_to_le16(hi);
+}
+
+static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
+{
+ u32 links = ocfs2_read_links_count(di);
+
+ links += n;
+
+ ocfs2_set_links_count(di, links);
+}
+
+static inline int ocfs2_refcount_tree(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
+ return 1;
+ return 0;
+}
+
/* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock
@@ -347,38 +613,73 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
return ret;
}
+static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
+{
+ return (osb->s_feature_incompat &
+ (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
+ OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
+}
+
+static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+{
+ if (ocfs2_clusterinfo_valid(osb) &&
+ memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
+{
+ if (ocfs2_clusterinfo_valid(osb) &&
+ !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
+{
+ return ocfs2_o2cb_stack(osb) &&
+ (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
+}
+
+static inline int ocfs2_mount_local(struct ocfs2_super *osb)
+{
+ return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
+}
+
+static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
+{
+ return (osb->s_feature_incompat &
+ OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP);
+}
+
+
#define OCFS2_IS_VALID_DINODE(ptr) \
(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \
- typeof(__di) ____di = (__di); \
- ocfs2_error((__sb), \
- "Dinode # %llu has bad signature %.*s", \
- (unsigned long long)(____di)->i_blkno, 7, \
- (____di)->i_signature); \
-} while (0);
-
#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \
- typeof(__eb) ____eb = (__eb); \
- ocfs2_error((__sb), \
- "Extent Block # %llu has bad signature %.*s", \
- (unsigned long long)(____eb)->h_blkno, 7, \
- (____eb)->h_signature); \
-} while (0);
-
#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \
- typeof(__gd) ____gd = (__gd); \
- ocfs2_error((__sb), \
- "Group Descriptor # %llu has bad signature %.*s", \
- (unsigned long long)(____gd)->bg_blkno, 7, \
- (____gd)->bg_signature); \
-} while (0);
+
+#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \
+ (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
+
+#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
+ (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
+
+#define OCFS2_IS_VALID_DX_ROOT(ptr) \
+ (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE))
+
+#define OCFS2_IS_VALID_DX_LEAF(ptr) \
+ (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
+
+#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \
+ (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE))
static inline unsigned long ino_from_blkno(struct super_block *sb,
u64 blkno)
@@ -430,6 +731,16 @@ static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
}
+static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb,
+ u64 blocks)
+{
+ int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits;
+ unsigned int clusters;
+
+ clusters = ocfs2_blocks_to_clusters(sb, blocks);
+ return (u64)clusters << bits;
+}
+
static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
u64 bytes)
{
@@ -454,9 +765,124 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
return (unsigned long)((bytes + 511) >> 9);
}
-#define ocfs2_set_bit ext2_set_bit
-#define ocfs2_clear_bit ext2_clear_bit
-#define ocfs2_test_bit ext2_test_bit
-#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
+static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
+ unsigned long pg_index)
+{
+ u32 clusters = pg_index;
+ unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+
+ if (unlikely(PAGE_CACHE_SHIFT > cbits))
+ clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
+ else if (PAGE_CACHE_SHIFT < cbits)
+ clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
+
+ return clusters;
+}
+
+/*
+ * Find the 1st page index which covers the given clusters.
+ */
+static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb,
+ u32 clusters)
+{
+ unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+ pgoff_t index = clusters;
+
+ if (PAGE_CACHE_SHIFT > cbits) {
+ index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits);
+ } else if (PAGE_CACHE_SHIFT < cbits) {
+ index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT);
+ }
+
+ return index;
+}
+
+static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
+{
+ unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+ unsigned int pages_per_cluster = 1;
+
+ if (PAGE_CACHE_SHIFT < cbits)
+ pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
+
+ return pages_per_cluster;
+}
+
+static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
+ unsigned int megs)
+{
+ BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576);
+
+ return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
+}
+
+static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
+ unsigned int clusters)
+{
+ return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
+}
+
+static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
+{
+ __set_bit_le(bit, bitmap);
+}
+#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
+
+static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
+{
+ __clear_bit_le(bit, bitmap);
+}
+#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
+
+#define ocfs2_test_bit test_bit_le
+#define ocfs2_find_next_zero_bit find_next_zero_bit_le
+#define ocfs2_find_next_bit find_next_bit_le
+
+static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr)
+{
+#if BITS_PER_LONG == 64
+ *bit += ((unsigned long) addr & 7UL) << 3;
+ addr = (void *) ((unsigned long) addr & ~7UL);
+#elif BITS_PER_LONG == 32
+ *bit += ((unsigned long) addr & 3UL) << 3;
+ addr = (void *) ((unsigned long) addr & ~3UL);
+#else
+#error "how many bits you are?!"
+#endif
+ return addr;
+}
+
+static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ ocfs2_set_bit(bit, bitmap);
+}
+
+static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ ocfs2_clear_bit(bit, bitmap);
+}
+
+static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ return ocfs2_test_bit(bit, bitmap);
+}
+
+static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max,
+ int start)
+{
+ int fix = 0, ret, tmpmax;
+ bitmap = correct_addr_and_bit_unaligned(&fix, bitmap);
+ tmpmax = max + fix;
+ start += fix;
+
+ ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
+}
+
#endif /* OCFS2_H */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3330a5dc6be..938387a10d5 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -64,6 +64,11 @@
#define OCFS2_INODE_SIGNATURE "INODE01"
#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
+#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
+#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
+#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
+#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
+#define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1"
/* Compatibility flags */
#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -85,9 +90,22 @@
#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \
OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
-#define OCFS2_FEATURE_COMPAT_SUPP 0
-#define OCFS2_FEATURE_INCOMPAT_SUPP 0
-#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
+#define OCFS2_FEATURE_COMPAT_SUPP (OCFS2_FEATURE_COMPAT_BACKUP_SB \
+ | OCFS2_FEATURE_COMPAT_JBD2_SB)
+#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
+ | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
+ | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
+ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
+ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
+ | OCFS2_FEATURE_INCOMPAT_XATTR \
+ | OCFS2_FEATURE_INCOMPAT_META_ECC \
+ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
+ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
+ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
+ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
+#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
+ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
+ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
/*
* Heartbeat-only devices are missing journals and other files. The
@@ -96,6 +114,103 @@
*/
#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002
+/*
+ * tunefs sets this incompat flag before starting the resize and clears it
+ * at the end. This flag protects users from inadvertently mounting the fs
+ * after an aborted run without fsck-ing.
+ */
+#define OCFS2_FEATURE_INCOMPAT_RESIZE_INPROG 0x0004
+
+/* Used to denote a non-clustered volume */
+#define OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT 0x0008
+
+/* Support for sparse allocation in b-trees */
+#define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC 0x0010
+
+/*
+ * Tunefs sets this incompat flag before starting an operation which
+ * would require cleanup on abort. This is done to protect users from
+ * inadvertently mounting the fs after an aborted run without
+ * fsck-ing.
+ *
+ * s_tunefs_flags on the super block describes precisely which
+ * operations were in progress.
+ */
+#define OCFS2_FEATURE_INCOMPAT_TUNEFS_INPROG 0x0020
+
+/* Support for data packed into inode blocks */
+#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040
+
+/*
+ * Support for alternate, userspace cluster stacks. If set, the superblock
+ * field s_cluster_info contains a tag for the alternate stack in use as
+ * well as the name of the cluster being joined.
+ * mount.ocfs2 must pass in a matching stack name.
+ *
+ * If not set, the classic stack will be used. This is compatbile with
+ * all older versions.
+ */
+#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080
+
+/* Support for the extended slot map */
+#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
+
+/* Support for extended attributes */
+#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
+
+/* Support for indexed directores */
+#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400
+
+/* Metadata checksum and error correction */
+#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
+
+/* Refcount tree support */
+#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
+
+/* Discontigous block groups */
+#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
+
+/*
+ * Incompat bit to indicate useable clusterinfo with stackflags for all
+ * cluster stacks (userspace adnd o2cb). If this bit is set,
+ * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
+ */
+#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
+
+/*
+ * backup superblock flag is used to indicate that this volume
+ * has backup superblocks.
+ */
+#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001
+
+/*
+ * The filesystem will correctly handle journal feature bits.
+ */
+#define OCFS2_FEATURE_COMPAT_JBD2_SB 0x0002
+
+/*
+ * Unwritten extents support.
+ */
+#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
+
+/*
+ * Maintain quota information for this filesystem
+ */
+#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
+#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
+
+/* The byte offset of the first backup block will be 1G.
+ * The following will be 4G, 16G, 64G, 256G and 1T.
+ */
+#define OCFS2_BACKUP_SB_START 1 << 30
+
+/* the max backup superblock nums */
+#define OCFS2_MAX_BACKUP_SUPERBLOCKS 6
+
+/*
+ * Flags on ocfs2_super_block.s_tunefs_flags
+ */
+#define OCFS2_TUNEFS_INPROG_REMOVE_SLOT 0x0001 /* Removing slots */
/*
* Flags on ocfs2_dinode.i_flags
@@ -113,26 +228,55 @@
#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */
#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
+#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
-/* Inode attributes, keep in sync with EXT2 */
-#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */
-#define OCFS2_UNRM_FL (0x00000002) /* Undelete */
-#define OCFS2_COMPR_FL (0x00000004) /* Compress file */
-#define OCFS2_SYNC_FL (0x00000008) /* Synchronous updates */
-#define OCFS2_IMMUTABLE_FL (0x00000010) /* Immutable file */
-#define OCFS2_APPEND_FL (0x00000020) /* writes to file may only append */
-#define OCFS2_NODUMP_FL (0x00000040) /* do not dump file */
-#define OCFS2_NOATIME_FL (0x00000080) /* do not update atime */
-#define OCFS2_DIRSYNC_FL (0x00010000) /* dirsync behaviour (directories only) */
+/*
+ * Flags on ocfs2_dinode.i_dyn_features
+ *
+ * These can change much more often than i_flags. When adding flags,
+ * keep in mind that i_dyn_features is only 16 bits wide.
+ */
+#define OCFS2_INLINE_DATA_FL (0x0001) /* Data stored in inode block */
+#define OCFS2_HAS_XATTR_FL (0x0002)
+#define OCFS2_INLINE_XATTR_FL (0x0004)
+#define OCFS2_INDEXED_DIR_FL (0x0008)
+#define OCFS2_HAS_REFCOUNT_FL (0x0010)
-#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */
-#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */
+/* Inode attributes, keep in sync with EXT2 */
+#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */
+#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */
+#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */
+#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */
+#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */
+#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */
+#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */
+#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */
+/* Reserved for compression usage... */
+#define OCFS2_DIRTY_FL FS_DIRTY_FL
+#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */
+#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */
+#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */
+/* End compression flags --- maybe not all used */
+#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */
+#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */
+#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */
+#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */
+#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */
+#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */
+#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/
+#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */
+
+#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */
+#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */
/*
- * ioctl commands
+ * Extent record flags (e_node.leaf.flags)
*/
-#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
-#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
+#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
+ * unwritten */
+#define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference
+ * counted in an associated
+ * refcount tree */
/*
* Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
@@ -156,9 +300,27 @@
#define OCFS2_VOL_UUID_LEN 16
#define OCFS2_MAX_VOL_LABEL_LEN 64
+/* The cluster stack fields */
+#define OCFS2_STACK_LABEL_LEN 4
+#define OCFS2_CLUSTER_NAME_LEN 16
+
+/* Classic (historically speaking) cluster stack */
+#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb"
+
/* Journal limits (in bytes) */
#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
+/*
+ * Inline extended attribute size (in bytes)
+ * The value chosen should be aligned to 16 byte boundaries.
+ */
+#define OCFS2_MIN_XATTR_INLINE_SIZE 256
+
+/*
+ * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
+ */
+#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01)
+
struct ocfs2_system_inode_info {
char *si_name;
int si_iflags;
@@ -173,15 +335,24 @@ enum {
#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
HEARTBEAT_SYSTEM_INODE,
GLOBAL_BITMAP_SYSTEM_INODE,
-#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
+ USER_QUOTA_SYSTEM_INODE,
+ GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
+#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
ORPHAN_DIR_SYSTEM_INODE,
EXTENT_ALLOC_SYSTEM_INODE,
INODE_ALLOC_SYSTEM_INODE,
JOURNAL_SYSTEM_INODE,
LOCAL_ALLOC_SYSTEM_INODE,
TRUNCATE_LOG_SYSTEM_INODE,
+ LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
NUM_SYSTEM_INODES
};
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE
+#define NUM_LOCAL_SYSTEM_INODES \
+ (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
/* Global system inodes (single copy) */
@@ -193,6 +364,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
[SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 },
[HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
[GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 },
+ [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+ [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 },
/* Slot-specific system inodes (one copy per slot) */
[ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
@@ -200,12 +373,15 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
[INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
[JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
[LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
- [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
+ [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 },
+ [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+ [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
};
/* Parameter passed from mount.ocfs2 to module */
#define OCFS2_HB_NONE "heartbeat=none"
#define OCFS2_HB_LOCAL "heartbeat=local"
+#define OCFS2_HB_GLOBAL "heartbeat=global"
/*
* OCFS2 directory file types. Only the low 3 bits are used. The
@@ -233,8 +409,12 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
OCFS2_DIR_ROUND) & \
~OCFS2_DIR_ROUND)
+#define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1)
#define OCFS2_LINK_MAX 32000
+#define OCFS2_DX_LINK_MAX ((1U << 31) - 1U)
+#define OCFS2_LINKS_HI_SHIFT 16
+#define OCFS2_DX_ENTRIES_MAX (0xffffffffU)
#define S_SHIFT 12
static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -254,12 +434,39 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
/*
+ * Block checking structure. This is used in metadata to validate the
+ * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all
+ * zeros.
+ */
+struct ocfs2_block_check {
+/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */
+ __le16 bc_ecc; /* Single-error-correction parity vector.
+ This is a simple Hamming code dependent
+ on the blocksize. OCFS2's maximum
+ blocksize, 4K, requires 16 parity bits,
+ so we fit in __le16. */
+ __le16 bc_reserved1;
+/*08*/
+};
+
+/*
* On disk extent record for OCFS2
* It describes a range of clusters on disk.
+ *
+ * Length fields are divided into interior and leaf node versions.
+ * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
*/
struct ocfs2_extent_rec {
/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
- __le32 e_clusters; /* Clusters covered by this extent */
+ union {
+ __le32 e_int_clusters; /* Clusters covered by all children */
+ struct {
+ __le16 e_leaf_clusters; /* Clusters covered by this
+ extent */
+ __u8 e_reserved1;
+ __u8 e_flags; /* Extent flags */
+ };
+ };
__le64 e_blkno; /* Physical disk offset, in blocks */
/*10*/
};
@@ -285,7 +492,10 @@ struct ocfs2_extent_list {
/*00*/ __le16 l_tree_depth; /* Extent tree depth from this
point. 0 means data extents
hang directly off this
- header (a leaf) */
+ header (a leaf)
+ NOTE: The high 8 bits cannot be
+ used - tree_depth is never that big.
+ */
__le16 l_count; /* Number of extent records */
__le16 l_next_free_rec; /* Next unused extent slot */
__le16 l_reserved1;
@@ -326,14 +536,17 @@ struct ocfs2_truncate_log {
struct ocfs2_extent_block
{
/*00*/ __u8 h_signature[8]; /* Signature for verification */
- __le64 h_reserved1;
+ struct ocfs2_block_check h_check; /* Error checking */
/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this
extent_header belongs to */
__le16 h_suballoc_bit; /* Bit offset in suballocator
block group */
__le32 h_fs_generation; /* Must match super block */
__le64 h_blkno; /* Offset on disk, in blocks */
-/*20*/ __le64 h_reserved3;
+/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this
+ eb belongs to. Only valid
+ if allocated from a
+ discontiguous block group */
__le64 h_next_leaf_blk; /* Offset on disk, in blocks,
of next leaf header pointing
to data */
@@ -342,6 +555,59 @@ struct ocfs2_extent_block
};
/*
+ * On disk slot map for OCFS2. This defines the contents of the "slot_map"
+ * system file. A slot is valid if it contains a node number >= 0. The
+ * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty.
+ */
+struct ocfs2_slot_map {
+/*00*/ __le16 sm_slots[0];
+/*
+ * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255,
+ * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize.
+ */
+};
+
+struct ocfs2_extended_slot {
+/*00*/ __u8 es_valid;
+ __u8 es_reserved1[3];
+ __le32 es_node_num;
+/*10*/
+};
+
+/*
+ * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP
+ * is set. It separates out the valid marker from the node number, and
+ * has room to grow. Unlike the old slot map, this format is defined by
+ * i_size.
+ */
+struct ocfs2_slot_map_extended {
+/*00*/ struct ocfs2_extended_slot se_slots[0];
+/*
+ * Actual size is i_size of the slot_map system file. It should
+ * match s_max_slots * sizeof(struct ocfs2_extended_slot)
+ */
+};
+
+/*
+ * ci_stackflags is only valid if the incompat bit
+ * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
+ */
+struct ocfs2_cluster_info {
+/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
+ union {
+ __le32 ci_reserved;
+ struct {
+ __u8 ci_stackflags;
+ __u8 ci_reserved1;
+ __u8 ci_reserved2;
+ __u8 ci_reserved3;
+ };
+ };
+/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
+/*18*/
+};
+
+/*
* On disk superblock for OCFS2
* Note that it is contained inside an ocfs2_dinode, so all offsets
* are relative to the start of ocfs2_dinode.id2.
@@ -367,13 +633,31 @@ struct ocfs2_super_block {
__le32 s_clustersize_bits; /* Clustersize for this fs */
/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
before tunefs required */
- __le16 s_reserved1;
- __le32 s_reserved2;
+ __le16 s_tunefs_flag;
+ __le32 s_uuid_hash; /* hash value of uuid */
__le64 s_first_cluster_group; /* Block offset of 1st cluster
* group header */
/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
-/*A0*/
+/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
+ userspace or clusterinfo
+ INCOMPAT flag set. */
+/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
+ for this fs*/
+ __le16 s_reserved0;
+ __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash.
+ * s_uuid_hash serves as seed[3]. */
+/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */
+/*140*/
+
+ /*
+ * NOTE: As stated above, all offsets are relative to
+ * ocfs2_dinode.id2, which is at 0xC0 in the inode.
+ * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within
+ * our smallest blocksize, which is 512 bytes. To ensure this,
+ * we reserve the space in s_reserved2. Anything past s_reserved2
+ * will not be available on the smallest blocksize.
+ */
};
/*
@@ -391,6 +675,19 @@ struct ocfs2_local_alloc
};
/*
+ * Data-in-inode header. This is only used if i_dyn_features has
+ * OCFS2_INLINE_DATA_FL set.
+ */
+struct ocfs2_inline_data
+{
+/*00*/ __le16 id_count; /* Number of bytes that can be used
+ * for data, starting at id_data */
+ __le16 id_reserved0;
+ __le32 id_reserved1;
+ __u8 id_data[0]; /* Start of user data */
+};
+
+/*
* On disk inode for OCFS2
*/
struct ocfs2_dinode {
@@ -400,7 +697,8 @@ struct ocfs2_dinode {
belongs to */
__le16 i_suballoc_bit; /* Bit offset in suballocator
block group */
-/*10*/ __le32 i_reserved0;
+/*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */
+ __le16 i_xattr_inline_size;
__le32 i_clusters; /* Cluster count */
__le32 i_uid; /* Owner UID */
__le32 i_gid; /* Owning GID */
@@ -419,9 +717,19 @@ struct ocfs2_dinode {
__le32 i_atime_nsec;
__le32 i_ctime_nsec;
__le32 i_mtime_nsec;
- __le32 i_attr;
- __le32 i_reserved1;
-/*70*/ __le64 i_reserved2[8];
+/*70*/ __le32 i_attr;
+ __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
+ was set in i_flags */
+ __le16 i_dyn_features;
+ __le64 i_xattr_loc;
+/*80*/ struct ocfs2_block_check i_check; /* Error checking */
+/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
+/*90*/ __le64 i_refcount_loc;
+ __le64 i_suballoc_loc; /* Suballocator block group this
+ inode belongs to. Only valid
+ if allocated from a
+ discontiguous block group */
+/*A0*/ __le64 i_reserved2[3];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
@@ -437,15 +745,19 @@ struct ocfs2_dinode {
struct { /* Info for journal system
inodes */
__le32 ij_flags; /* Mounted, version, etc. */
- __le32 ij_pad;
+ __le32 ij_recovery_generation; /* Incremented when the
+ journal is recovered
+ after an unclean
+ shutdown */
} journal1;
- } id1; /* Inode type dependant 1 */
+ } id1; /* Inode type dependent 1 */
/*C0*/ union {
struct ocfs2_super_block i_super;
struct ocfs2_local_alloc i_lab;
struct ocfs2_chain_list i_chain;
struct ocfs2_extent_list i_list;
struct ocfs2_truncate_log i_dealloc;
+ struct ocfs2_inline_data i_data;
__u8 i_symlink[0];
} id2;
/* Actual on-disk size is one block */
@@ -466,6 +778,130 @@ struct ocfs2_dir_entry {
} __attribute__ ((packed));
/*
+ * Per-block record for the unindexed directory btree. This is carefully
+ * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are
+ * mirrored. That way, the directory manipulation code needs a minimal amount
+ * of update.
+ *
+ * NOTE: Keep this structure aligned to a multiple of 4 bytes.
+ */
+struct ocfs2_dir_block_trailer {
+/*00*/ __le64 db_compat_inode; /* Always zero. Was inode */
+
+ __le16 db_compat_rec_len; /* Backwards compatible with
+ * ocfs2_dir_entry. */
+ __u8 db_compat_name_len; /* Always zero. Was name_len */
+ __u8 db_reserved0;
+ __le16 db_reserved1;
+ __le16 db_free_rec_len; /* Size of largest empty hole
+ * in this block. (unused) */
+/*10*/ __u8 db_signature[8]; /* Signature for verification */
+ __le64 db_reserved2;
+ __le64 db_free_next; /* Next block in list (unused) */
+/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */
+ __le64 db_parent_dinode; /* dinode which owns me, in
+ blocks */
+/*30*/ struct ocfs2_block_check db_check; /* Error checking */
+/*40*/
+};
+
+ /*
+ * A directory entry in the indexed tree. We don't store the full name here,
+ * but instead provide a pointer to the full dirent in the unindexed tree.
+ *
+ * We also store name_len here so as to reduce the number of leaf blocks we
+ * need to search in case of collisions.
+ */
+struct ocfs2_dx_entry {
+ __le32 dx_major_hash; /* Used to find logical
+ * cluster in index */
+ __le32 dx_minor_hash; /* Lower bits used to find
+ * block in cluster */
+ __le64 dx_dirent_blk; /* Physical block in unindexed
+ * tree holding this dirent. */
+};
+
+struct ocfs2_dx_entry_list {
+ __le32 de_reserved;
+ __le16 de_count; /* Maximum number of entries
+ * possible in de_entries */
+ __le16 de_num_used; /* Current number of
+ * de_entries entries */
+ struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries
+ * in a packed array of
+ * length de_num_used */
+};
+
+#define OCFS2_DX_FLAG_INLINE 0x01
+
+/*
+ * A directory indexing block. Each indexed directory has one of these,
+ * pointed to by ocfs2_dinode.
+ *
+ * This block stores an indexed btree root, and a set of free space
+ * start-of-list pointers.
+ */
+struct ocfs2_dx_root_block {
+ __u8 dr_signature[8]; /* Signature for verification */
+ struct ocfs2_block_check dr_check; /* Error checking */
+ __le16 dr_suballoc_slot; /* Slot suballocator this
+ * block belongs to. */
+ __le16 dr_suballoc_bit; /* Bit offset in suballocator
+ * block group */
+ __le32 dr_fs_generation; /* Must match super block */
+ __le64 dr_blkno; /* Offset on disk, in blocks */
+ __le64 dr_last_eb_blk; /* Pointer to last
+ * extent block */
+ __le32 dr_clusters; /* Clusters allocated
+ * to the indexed tree. */
+ __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */
+ __u8 dr_reserved0;
+ __le16 dr_reserved1;
+ __le64 dr_dir_blkno; /* Pointer to parent inode */
+ __le32 dr_num_entries; /* Total number of
+ * names stored in
+ * this directory.*/
+ __le32 dr_reserved2;
+ __le64 dr_free_blk; /* Pointer to head of free
+ * unindexed block list. */
+ __le64 dr_suballoc_loc; /* Suballocator block group
+ this root belongs to.
+ Only valid if allocated
+ from a discontiguous
+ block group */
+ __le64 dr_reserved3[14];
+ union {
+ struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
+ * bits for maximum space
+ * efficiency. */
+ struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of
+ * entries. We grow out
+ * to extents if this
+ * gets too big. */
+ };
+};
+
+/*
+ * The header of a leaf block in the indexed tree.
+ */
+struct ocfs2_dx_leaf {
+ __u8 dl_signature[8];/* Signature for verification */
+ struct ocfs2_block_check dl_check; /* Error checking */
+ __le64 dl_blkno; /* Offset on disk, in blocks */
+ __le32 dl_fs_generation;/* Must match super block */
+ __le32 dl_reserved0;
+ __le64 dl_reserved1;
+ struct ocfs2_dx_entry_list dl_list;
+};
+
+/*
+ * Largest bitmap for a block (suballocator) group in bytes. This limit
+ * does not affect cluster groups (global allocator). Cluster group
+ * bitmaps run to the end of the block.
+ */
+#define OCFS2_MAX_BG_BITMAP_SIZE 256
+
+/*
* On disk allocator group structure for OCFS2
*/
struct ocfs2_group_desc
@@ -484,10 +920,352 @@ struct ocfs2_group_desc
/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
blocks */
__le64 bg_blkno; /* Offset on disk, in blocks */
-/*30*/ __le64 bg_reserved2[2];
-/*40*/ __u8 bg_bitmap[0];
+/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
+ __le64 bg_reserved2;
+/*40*/ union {
+ __u8 bg_bitmap[0];
+ struct {
+ /*
+ * Block groups may be discontiguous when
+ * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
+ * The extents of a discontigous block group are
+ * stored in bg_list. It is a flat list.
+ * l_tree_depth must always be zero. A
+ * discontiguous group is signified by a non-zero
+ * bg_list->l_next_free_rec. Only block groups
+ * can be discontiguous; Cluster groups cannot.
+ * We've never made a block group with more than
+ * 2048 blocks (256 bytes of bg_bitmap). This
+ * codifies that limit so that we can fit bg_list.
+ * bg_size of a discontiguous block group will
+ * be 256 to match bg_bitmap_filler.
+ */
+ __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
+/*140*/ struct ocfs2_extent_list bg_list;
+ };
+ };
+/* Actual on-disk size is one block */
+};
+
+struct ocfs2_refcount_rec {
+/*00*/ __le64 r_cpos; /* Physical offset, in clusters */
+ __le32 r_clusters; /* Clusters covered by this extent */
+ __le32 r_refcount; /* Reference count of this extent */
+/*10*/
+};
+#define OCFS2_32BIT_POS_MASK (0xffffffffULL)
+
+#define OCFS2_REFCOUNT_LEAF_FL (0x00000001)
+#define OCFS2_REFCOUNT_TREE_FL (0x00000002)
+
+struct ocfs2_refcount_list {
+/*00*/ __le16 rl_count; /* Maximum number of entries possible
+ in rl_records */
+ __le16 rl_used; /* Current number of used records */
+ __le32 rl_reserved2;
+ __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */
+/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */
+};
+
+
+struct ocfs2_refcount_block {
+/*00*/ __u8 rf_signature[8]; /* Signature for verification */
+ __le16 rf_suballoc_slot; /* Slot suballocator this block
+ belongs to */
+ __le16 rf_suballoc_bit; /* Bit offset in suballocator
+ block group */
+ __le32 rf_fs_generation; /* Must match superblock */
+/*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */
+ __le64 rf_parent; /* Parent block, only valid if
+ OCFS2_REFCOUNT_LEAF_FL is set in
+ rf_flags */
+/*20*/ struct ocfs2_block_check rf_check; /* Error checking */
+ __le64 rf_last_eb_blk; /* Pointer to last extent block */
+/*30*/ __le32 rf_count; /* Number of inodes sharing this
+ refcount tree */
+ __le32 rf_flags; /* See the flags above */
+ __le32 rf_clusters; /* clusters covered by refcount tree. */
+ __le32 rf_cpos; /* cluster offset in refcount tree.*/
+/*40*/ __le32 rf_generation; /* generation number. all be the same
+ * for the same refcount tree. */
+ __le32 rf_reserved0;
+ __le64 rf_suballoc_loc; /* Suballocator block group this
+ refcount block belongs to. Only
+ valid if allocated from a
+ discontiguous block group */
+/*50*/ __le64 rf_reserved1[6];
+/*80*/ union {
+ struct ocfs2_refcount_list rf_records; /* List of refcount
+ records */
+ struct ocfs2_extent_list rf_list; /* Extent record list,
+ only valid if
+ OCFS2_REFCOUNT_TREE_FL
+ is set in rf_flags */
+ };
+/* Actual on-disk size is one block */
+};
+
+/*
+ * On disk extended attribute structure for OCFS2.
+ */
+
+/*
+ * ocfs2_xattr_entry indicates one extend attribute.
+ *
+ * Note that it can be stored in inode, one block or one xattr bucket.
+ */
+struct ocfs2_xattr_entry {
+ __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */
+ __le16 xe_name_offset; /* byte offset from the 1st entry in the
+ local xattr storage(inode, xattr block or
+ xattr bucket). */
+ __u8 xe_name_len; /* xattr name len, doesn't include prefix. */
+ __u8 xe_type; /* the low 7 bits indicate the name prefix
+ * type and the highest bit indicates whether
+ * the EA is stored in the local storage. */
+ __le64 xe_value_size; /* real xattr value length. */
+};
+
+/*
+ * On disk structure for xattr header.
+ *
+ * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in
+ * the local xattr storage.
+ */
+struct ocfs2_xattr_header {
+ __le16 xh_count; /* contains the count of how
+ many records are in the
+ local xattr storage. */
+ __le16 xh_free_start; /* current offset for storing
+ xattr. */
+ __le16 xh_name_value_len; /* total length of name/value
+ length in this bucket. */
+ __le16 xh_num_buckets; /* Number of xattr buckets
+ in this extent record,
+ only valid in the first
+ bucket. */
+ struct ocfs2_block_check xh_check; /* Error checking
+ (Note, this is only
+ used for xattr
+ buckets. A block uses
+ xb_check and sets
+ this field to zero.) */
+ struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
+};
+
+/*
+ * On disk structure for xattr value root.
+ *
+ * When an xattr's value is large enough, it is stored in an external
+ * b-tree like file data. The xattr value root points to this structure.
+ */
+struct ocfs2_xattr_value_root {
+/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */
+ __le32 xr_reserved0;
+ __le64 xr_last_eb_blk; /* Pointer to last extent block */
+/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */
};
+/*
+ * On disk structure for xattr tree root.
+ *
+ * It is used when there are too many extended attributes for one file. These
+ * attributes will be organized and stored in an indexed-btree.
+ */
+struct ocfs2_xattr_tree_root {
+/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */
+ __le32 xt_reserved0;
+ __le64 xt_last_eb_blk; /* Pointer to last extent block */
+/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */
+};
+
+#define OCFS2_XATTR_INDEXED 0x1
+#define OCFS2_HASH_SHIFT 5
+#define OCFS2_XATTR_ROUND 3
+#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \
+ ~(OCFS2_XATTR_ROUND))
+
+#define OCFS2_XATTR_BUCKET_SIZE 4096
+#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \
+ / OCFS2_MIN_BLOCKSIZE)
+
+/*
+ * On disk structure for xattr block.
+ */
+struct ocfs2_xattr_block {
+/*00*/ __u8 xb_signature[8]; /* Signature for verification */
+ __le16 xb_suballoc_slot; /* Slot suballocator this
+ block belongs to. */
+ __le16 xb_suballoc_bit; /* Bit offset in suballocator
+ block group */
+ __le32 xb_fs_generation; /* Must match super block */
+/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */
+ struct ocfs2_block_check xb_check; /* Error checking */
+/*20*/ __le16 xb_flags; /* Indicates whether this block contains
+ real xattr or a xattr tree. */
+ __le16 xb_reserved0;
+ __le32 xb_reserved1;
+ __le64 xb_suballoc_loc; /* Suballocator block group this
+ xattr block belongs to. Only
+ valid if allocated from a
+ discontiguous block group */
+/*30*/ union {
+ struct ocfs2_xattr_header xb_header; /* xattr header if this
+ block contains xattr */
+ struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this
+ block cotains xattr
+ tree. */
+ } xb_attrs;
+};
+
+#define OCFS2_XATTR_ENTRY_LOCAL 0x80
+#define OCFS2_XATTR_TYPE_MASK 0x7F
+static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe,
+ int local)
+{
+ if (local)
+ xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL;
+ else
+ xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe)
+{
+ return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type)
+{
+ xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK;
+}
+
+static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
+{
+ return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
+}
+
+/*
+ * On disk structures for global quota file
+ */
+
+/* Magic numbers and known versions for global quota files */
+#define OCFS2_GLOBAL_QMAGICS {\
+ 0x0cf52470, /* USRQUOTA */ \
+ 0x0cf52471 /* GRPQUOTA */ \
+}
+
+#define OCFS2_GLOBAL_QVERSIONS {\
+ 0, \
+ 0, \
+}
+
+
+/* Each block of each quota file has a certain fixed number of bytes reserved
+ * for OCFS2 internal use at its end. OCFS2 can use it for things like
+ * checksums, etc. */
+#define OCFS2_QBLK_RESERVED_SPACE 8
+
+/* Generic header of all quota files */
+struct ocfs2_disk_dqheader {
+ __le32 dqh_magic; /* Magic number identifying file */
+ __le32 dqh_version; /* Quota format version */
+};
+
+#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of global quota file (immediately follows the generic
+ * header) */
+struct ocfs2_global_disk_dqinfo {
+/*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */
+ __le32 dqi_igrace; /* Grace time for inode softlimit excess */
+ __le32 dqi_syncms; /* Time after which we sync local changes to
+ * global quota file */
+ __le32 dqi_blocks; /* Number of blocks in quota file */
+/*10*/ __le32 dqi_free_blk; /* First free block in quota file */
+ __le32 dqi_free_entry; /* First block with free dquot entry in quota
+ * file */
+};
+
+/* Structure with global user / group information. We reserve some space
+ * for future use. */
+struct ocfs2_global_disk_dqblk {
+/*00*/ __le32 dqb_id; /* ID the structure belongs to */
+ __le32 dqb_use_count; /* Number of nodes having reference to this structure */
+ __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */
+/*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */
+ __le64 dqb_curinodes; /* current # allocated inodes */
+/*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */
+ __le64 dqb_bsoftlimit; /* preferred limit on disk space */
+/*30*/ __le64 dqb_curspace; /* current space occupied */
+ __le64 dqb_btime; /* time limit for excessive disk use */
+/*40*/ __le64 dqb_itime; /* time limit for excessive inode use */
+ __le64 dqb_pad1;
+/*50*/ __le64 dqb_pad2;
+};
+
+/*
+ * On-disk structures for local quota file
+ */
+
+/* Magic numbers and known versions for local quota files */
+#define OCFS2_LOCAL_QMAGICS {\
+ 0x0cf524c0, /* USRQUOTA */ \
+ 0x0cf524c1 /* GRPQUOTA */ \
+}
+
+#define OCFS2_LOCAL_QVERSIONS {\
+ 0, \
+ 0, \
+}
+
+/* Quota flags in dqinfo header */
+#define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\
+ * quota has been cleanly turned off) */
+
+#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of local quota file (immediately follows the generic
+ * header) */
+struct ocfs2_local_disk_dqinfo {
+ __le32 dqi_flags; /* Flags for quota file */
+ __le32 dqi_chunks; /* Number of chunks of quota structures
+ * with a bitmap */
+ __le32 dqi_blocks; /* Number of blocks allocated for quota file */
+};
+
+/* Header of one chunk of a quota file */
+struct ocfs2_local_disk_chunk {
+ __le32 dqc_free; /* Number of free entries in the bitmap */
+ __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding
+ * chunk of quota file */
+};
+
+/* One entry in local quota file */
+struct ocfs2_local_disk_dqblk {
+/*00*/ __le64 dqb_id; /* id this quota applies to */
+ __le64 dqb_spacemod; /* Change in the amount of used space */
+/*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */
+};
+
+
+/*
+ * The quota trailer lives at the end of each quota block.
+ */
+
+struct ocfs2_disk_dqtrailer {
+/*00*/ struct ocfs2_block_check dq_check; /* Error checking */
+/*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */
+};
+
+static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize,
+ void *buf)
+{
+ char *ptr = buf;
+ ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE;
+
+ return (struct ocfs2_disk_dqtrailer *)ptr;
+}
+
#ifdef __KERNEL__
static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
{
@@ -495,6 +1273,20 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
offsetof(struct ocfs2_dinode, id2.i_symlink);
}
+static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
+ struct ocfs2_dinode *di)
+{
+ unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+ if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+ return sb->s_blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
+ xattrsize;
+ else
+ return sb->s_blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+}
+
static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
{
int size;
@@ -505,6 +1297,34 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
return size / sizeof(struct ocfs2_extent_rec);
}
+static inline int ocfs2_extent_recs_per_inode_with_xattr(
+ struct super_block *sb,
+ struct ocfs2_dinode *di)
+{
+ int size;
+ unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+ if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_list.l_recs) -
+ xattrsize;
+ else
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_dx_root_block, dr_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
+
static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
{
int size;
@@ -525,6 +1345,36 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
return size / sizeof(struct ocfs2_extent_rec);
}
+static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_group_desc, bg_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_dx_leaf, dl_list.de_entries);
+
+ return size / sizeof(struct ocfs2_dx_entry);
+}
+
+static inline int ocfs2_dx_entries_per_root(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries);
+
+ return size / sizeof(struct ocfs2_dx_entry);
+}
+
static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
{
u16 size;
@@ -535,13 +1385,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
return size;
}
-static inline int ocfs2_group_bitmap_size(struct super_block *sb)
+static inline int ocfs2_group_bitmap_size(struct super_block *sb,
+ int suballocator,
+ u32 feature_incompat)
{
- int size;
-
- size = sb->s_blocksize -
+ int size = sb->s_blocksize -
offsetof(struct ocfs2_group_desc, bg_bitmap);
+ /*
+ * The cluster allocator uses the entire block. Suballocators have
+ * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
+ * code expects bg_size set to the maximum. Thus we must keep
+ * bg_size as-is unless discontig_bg is enabled.
+ */
+ if (suballocator &&
+ (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+ size = OCFS2_MAX_BG_BITMAP_SIZE;
+
return size;
}
@@ -554,12 +1414,75 @@ static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb)
return size / sizeof(struct ocfs2_truncate_rec);
}
+
+static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index)
+{
+ u64 offset = OCFS2_BACKUP_SB_START;
+
+ if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) {
+ offset <<= (2 * index);
+ offset >>= sb->s_blocksize_bits;
+ return offset;
+ }
+
+ return 0;
+
+}
+
+static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_xattr_block,
+ xb_attrs.xb_root.xt_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_refcount_block, rf_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_refcount_block, rf_records.rl_recs);
+
+ return size / sizeof(struct ocfs2_refcount_rec);
+}
+
+static inline u32
+ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec)
+{
+ return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
+}
#else
static inline int ocfs2_fast_symlink_chars(int blocksize)
{
return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
}
+static inline int ocfs2_max_inline_data_with_xattr(int blocksize,
+ struct ocfs2_dinode *di)
+{
+ if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL))
+ return blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
+ di->i_xattr_inline_size;
+ else
+ return blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+}
+
static inline int ocfs2_extent_recs_per_inode(int blocksize)
{
int size;
@@ -590,23 +1513,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
return size / sizeof(struct ocfs2_extent_rec);
}
-static inline int ocfs2_local_alloc_size(int blocksize)
+static inline int ocfs2_extent_recs_per_gd(int blocksize)
{
int size;
size = blocksize -
- offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+ offsetof(struct ocfs2_group_desc, bg_list.l_recs);
- return size;
+ return size / sizeof(struct ocfs2_extent_rec);
}
-static inline int ocfs2_group_bitmap_size(int blocksize)
+static inline int ocfs2_local_alloc_size(int blocksize)
{
int size;
size = blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+ return size;
+}
+
+static inline int ocfs2_group_bitmap_size(int blocksize,
+ int suballocator,
+ uint32_t feature_incompat)
+{
+ int size = sb->s_blocksize -
offsetof(struct ocfs2_group_desc, bg_bitmap);
+ /*
+ * The cluster allocator uses the entire block. Suballocators have
+ * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
+ * code expects bg_size set to the maximum. Thus we must keep
+ * bg_size as-is unless discontig_bg is enabled.
+ */
+ if (suballocator &&
+ (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+ size = OCFS2_MAX_BG_BITMAP_SIZE;
+
return size;
}
@@ -619,6 +1562,30 @@ static inline int ocfs2_truncate_recs_per_inode(int blocksize)
return size / sizeof(struct ocfs2_truncate_rec);
}
+
+static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index)
+{
+ uint64_t offset = OCFS2_BACKUP_SB_START;
+
+ if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) {
+ offset <<= (2 * index);
+ offset /= blocksize;
+ return offset;
+ }
+
+ return 0;
+}
+
+static inline int ocfs2_xattr_recs_per_xb(int blocksize)
+{
+ int size;
+
+ size = blocksize -
+ offsetof(struct ocfs2_xattr_block,
+ xb_attrs.xb_root.xt_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
#endif /* __KERNEL__ */
@@ -639,7 +1606,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
* list has a copy per slot.
*/
if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
- chars = snprintf(buf, len,
+ chars = snprintf(buf, len, "%s",
ocfs2_system_inodes[type].si_name);
else
chars = snprintf(buf, len,
@@ -655,5 +1622,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
+static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
+{
+ if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
+ le16_to_cpu(gd->bg_size)) !=
+ offsetof(struct ocfs2_group_desc, bg_list))
+ return 0;
+ /*
+ * Only valid to check l_next_free_rec if
+ * bg_bitmap + bg_size == bg_list.
+ */
+ if (!gd->bg_list.l_next_free_rec)
+ return 0;
+ return 1;
+}
#endif /* _OCFS2_FS_H */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
new file mode 100644
index 00000000000..5b27ff1fa57
--- /dev/null
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -0,0 +1,242 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_ioctl.h
+ *
+ * Defines OCFS2 ioctls.
+ *
+ * Copyright (C) 2010 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_IOCTL_H
+#define OCFS2_IOCTL_H
+
+/*
+ * ioctl commands
+ */
+#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+
+/*
+ * Space reservation / allocation / free ioctls and argument structure
+ * are designed to be compatible with XFS.
+ *
+ * ALLOCSP* and FREESP* are not and will never be supported, but are
+ * included here for completeness.
+ */
+struct ocfs2_space_resv {
+ __s16 l_type;
+ __s16 l_whence;
+ __s64 l_start;
+ __s64 l_len; /* len == 0 means until end of file */
+ __s32 l_sysid;
+ __u32 l_pid;
+ __s32 l_pad[4]; /* reserve area */
+};
+
+#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
+#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
+
+/* Used to pass group descriptor data when online resize is done */
+struct ocfs2_new_group_input {
+ __u64 group; /* Group descriptor's blkno. */
+ __u32 clusters; /* Total number of clusters in this group */
+ __u32 frees; /* Total free clusters in this group */
+ __u16 chain; /* Chain for this group */
+ __u16 reserved1;
+ __u32 reserved2;
+};
+
+#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
+#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
+#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
+
+/* Used to pass 2 file names to reflink. */
+struct reflink_arguments {
+ __u64 old_path;
+ __u64 new_path;
+ __u64 preserve;
+};
+#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
+
+/* Following definitions dedicated for ocfs2_info_request ioctls. */
+#define OCFS2_INFO_MAX_REQUEST (50)
+#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2)
+
+/* Magic number of all requests */
+#define OCFS2_INFO_MAGIC (0x4F32494E)
+
+/*
+ * Always try to separate info request into small pieces to
+ * guarantee the backward&forward compatibility.
+ */
+struct ocfs2_info {
+ __u64 oi_requests; /* Array of __u64 pointers to requests */
+ __u32 oi_count; /* Number of requests in info_requests */
+ __u32 oi_pad;
+};
+
+struct ocfs2_info_request {
+/*00*/ __u32 ir_magic; /* Magic number */
+ __u32 ir_code; /* Info request code */
+ __u32 ir_size; /* Size of request */
+ __u32 ir_flags; /* Request flags */
+/*10*/ /* Request specific fields */
+};
+
+struct ocfs2_info_clustersize {
+ struct ocfs2_info_request ic_req;
+ __u32 ic_clustersize;
+ __u32 ic_pad;
+};
+
+struct ocfs2_info_blocksize {
+ struct ocfs2_info_request ib_req;
+ __u32 ib_blocksize;
+ __u32 ib_pad;
+};
+
+struct ocfs2_info_maxslots {
+ struct ocfs2_info_request im_req;
+ __u32 im_max_slots;
+ __u32 im_pad;
+};
+
+struct ocfs2_info_label {
+ struct ocfs2_info_request il_req;
+ __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN];
+} __attribute__ ((packed));
+
+struct ocfs2_info_uuid {
+ struct ocfs2_info_request iu_req;
+ __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
+} __attribute__ ((packed));
+
+struct ocfs2_info_fs_features {
+ struct ocfs2_info_request if_req;
+ __u32 if_compat_features;
+ __u32 if_incompat_features;
+ __u32 if_ro_compat_features;
+ __u32 if_pad;
+};
+
+struct ocfs2_info_journal_size {
+ struct ocfs2_info_request ij_req;
+ __u64 ij_journal_size;
+};
+
+struct ocfs2_info_freeinode {
+ struct ocfs2_info_request ifi_req;
+ struct ocfs2_info_local_freeinode {
+ __u64 lfi_total;
+ __u64 lfi_free;
+ } ifi_stat[OCFS2_MAX_SLOTS];
+ __u32 ifi_slotnum; /* out */
+ __u32 ifi_pad;
+};
+
+#define OCFS2_INFO_MAX_HIST (32)
+
+struct ocfs2_info_freefrag {
+ struct ocfs2_info_request iff_req;
+ struct ocfs2_info_freefrag_stats { /* (out) */
+ struct ocfs2_info_free_chunk_list {
+ __u32 fc_chunks[OCFS2_INFO_MAX_HIST];
+ __u32 fc_clusters[OCFS2_INFO_MAX_HIST];
+ } ffs_fc_hist;
+ __u32 ffs_clusters;
+ __u32 ffs_free_clusters;
+ __u32 ffs_free_chunks;
+ __u32 ffs_free_chunks_real;
+ __u32 ffs_min; /* Minimum free chunksize in clusters */
+ __u32 ffs_max;
+ __u32 ffs_avg;
+ __u32 ffs_pad;
+ } iff_ffs;
+ __u32 iff_chunksize; /* chunksize in clusters(in) */
+ __u32 iff_pad;
+};
+
+/* Codes for ocfs2_info_request */
+enum ocfs2_info_type {
+ OCFS2_INFO_CLUSTERSIZE = 1,
+ OCFS2_INFO_BLOCKSIZE,
+ OCFS2_INFO_MAXSLOTS,
+ OCFS2_INFO_LABEL,
+ OCFS2_INFO_UUID,
+ OCFS2_INFO_FS_FEATURES,
+ OCFS2_INFO_JOURNAL_SIZE,
+ OCFS2_INFO_FREEINODE,
+ OCFS2_INFO_FREEFRAG,
+ OCFS2_INFO_NUM_TYPES
+};
+
+/* Flags for struct ocfs2_info_request */
+/* Filled by the caller */
+#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not
+ required. This is a hint.
+ It is up to ocfs2 whether
+ the request can be fulfilled
+ without locking. */
+/* Filled by ocfs2 */
+#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood
+ this request and
+ filled in the answer */
+
+#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during
+ request handling. */
+
+#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
+
+struct ocfs2_move_extents {
+/* All values are in bytes */
+ /* in */
+ __u64 me_start; /* Virtual start in the file to move */
+ __u64 me_len; /* Length of the extents to be moved */
+ __u64 me_goal; /* Physical offset of the goal,
+ it's in block unit */
+ __u64 me_threshold; /* Maximum distance from goal or threshold
+ for auto defragmentation */
+ __u64 me_flags; /* Flags for the operation:
+ * - auto defragmentation.
+ * - refcount,xattr cases.
+ */
+ /* out */
+ __u64 me_moved_len; /* Moved/defraged length */
+ __u64 me_new_offset; /* Resulting physical location */
+ __u32 me_reserved[2]; /* Reserved for futhure */
+};
+
+#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to
+ claim new clusters
+ as the goal place
+ for extents moving */
+#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent
+ moving, is to make
+ movement less likely
+ to fail, may make fs
+ even more fragmented */
+#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation
+ completely gets done.
+ */
+
+#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents)
+
+#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4d5d5655c18..d277aabf5df 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -44,6 +44,12 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_RENAME,
OCFS2_LOCK_TYPE_RW,
OCFS2_LOCK_TYPE_DENTRY,
+ OCFS2_LOCK_TYPE_OPEN,
+ OCFS2_LOCK_TYPE_FLOCK,
+ OCFS2_LOCK_TYPE_QINFO,
+ OCFS2_LOCK_TYPE_NFS_SYNC,
+ OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+ OCFS2_LOCK_TYPE_REFCOUNT,
OCFS2_NUM_LOCK_TYPES
};
@@ -69,6 +75,24 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_DENTRY:
c = 'N';
break;
+ case OCFS2_LOCK_TYPE_OPEN:
+ c = 'O';
+ break;
+ case OCFS2_LOCK_TYPE_FLOCK:
+ c = 'F';
+ break;
+ case OCFS2_LOCK_TYPE_QINFO:
+ c = 'Q';
+ break;
+ case OCFS2_LOCK_TYPE_NFS_SYNC:
+ c = 'Y';
+ break;
+ case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
+ c = 'P';
+ break;
+ case OCFS2_LOCK_TYPE_REFCOUNT:
+ c = 'T';
+ break;
default:
c = '\0';
}
@@ -85,12 +109,18 @@ static char *ocfs2_lock_type_strings[] = {
* important job it does, anyway. */
[OCFS2_LOCK_TYPE_RW] = "Write/Read",
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
+ [OCFS2_LOCK_TYPE_OPEN] = "Open",
+ [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
+ [OCFS2_LOCK_TYPE_QINFO] = "Quota",
+ [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
+ [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
+ [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
{
#ifdef __KERNEL__
- mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
+ BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
#endif
return ocfs2_lock_type_strings[type];
}
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
new file mode 100644
index 00000000000..2e45c8d2ea7
--- /dev/null
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -0,0 +1,32 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_lockingver.h
+ *
+ * Defines OCFS2 Locking version values.
+ *
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_LOCKINGVER_H
+#define OCFS2_LOCKINGVER_H
+
+/*
+ * The protocol version for ocfs2 cluster locking. See dlmglue.c for
+ * more details.
+ *
+ * 1.0 - Initial locking version from ocfs2 1.4.
+ */
+#define OCFS2_LOCKING_PROTOCOL_MAJOR 1
+#define OCFS2_LOCKING_PROTOCOL_MINOR 0
+
+#endif /* OCFS2_LOCKINGVER_H */
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
new file mode 100644
index 00000000000..6cb019b7c6a
--- /dev/null
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -0,0 +1,2768 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ocfs2
+
+#if !defined(_TRACE_OCFS2_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OCFS2_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(ocfs2__int,
+ TP_PROTO(int num),
+ TP_ARGS(num),
+ TP_STRUCT__entry(
+ __field(int, num)
+ ),
+ TP_fast_assign(
+ __entry->num = num;
+ ),
+ TP_printk("%d", __entry->num)
+);
+
+#define DEFINE_OCFS2_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__int, name, \
+ TP_PROTO(int num), \
+ TP_ARGS(num))
+
+DECLARE_EVENT_CLASS(ocfs2__uint,
+ TP_PROTO(unsigned int num),
+ TP_ARGS(num),
+ TP_STRUCT__entry(
+ __field( unsigned int, num )
+ ),
+ TP_fast_assign(
+ __entry->num = num;
+ ),
+ TP_printk("%u", __entry->num)
+);
+
+#define DEFINE_OCFS2_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__uint, name, \
+ TP_PROTO(unsigned int num), \
+ TP_ARGS(num))
+
+DECLARE_EVENT_CLASS(ocfs2__ull,
+ TP_PROTO(unsigned long long blkno),
+ TP_ARGS(blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%llu", __entry->blkno)
+);
+
+#define DEFINE_OCFS2_ULL_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull, name, \
+ TP_PROTO(unsigned long long num), \
+ TP_ARGS(num))
+
+DECLARE_EVENT_CLASS(ocfs2__pointer,
+ TP_PROTO(void *pointer),
+ TP_ARGS(pointer),
+ TP_STRUCT__entry(
+ __field(void *, pointer)
+ ),
+ TP_fast_assign(
+ __entry->pointer = pointer;
+ ),
+ TP_printk("%p", __entry->pointer)
+);
+
+#define DEFINE_OCFS2_POINTER_EVENT(name) \
+DEFINE_EVENT(ocfs2__pointer, name, \
+ TP_PROTO(void *pointer), \
+ TP_ARGS(pointer))
+
+DECLARE_EVENT_CLASS(ocfs2__string,
+ TP_PROTO(const char *name),
+ TP_ARGS(name),
+ TP_STRUCT__entry(
+ __string(name,name)
+ ),
+ TP_fast_assign(
+ __assign_str(name, name);
+ ),
+ TP_printk("%s", __get_str(name))
+);
+
+#define DEFINE_OCFS2_STRING_EVENT(name) \
+DEFINE_EVENT(ocfs2__string, name, \
+ TP_PROTO(const char *name), \
+ TP_ARGS(name))
+
+DECLARE_EVENT_CLASS(ocfs2__int_int,
+ TP_PROTO(int value1, int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(int, value1)
+ __field(int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%d %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_INT_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__int_int, name, \
+ TP_PROTO(int val1, int val2), \
+ TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__uint_int,
+ TP_PROTO(unsigned int value1, int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned int, value1)
+ __field(int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%u %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_UINT_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__uint_int, name, \
+ TP_PROTO(unsigned int val1, int val2), \
+ TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__uint_uint,
+ TP_PROTO(unsigned int value1, unsigned int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned int, value1)
+ __field(unsigned int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%u %u", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__uint_uint, name, \
+ TP_PROTO(unsigned int val1, unsigned int val2), \
+ TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint,
+ TP_PROTO(unsigned long long value1, unsigned int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%llu %u", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_uint, name, \
+ TP_PROTO(unsigned long long val1, unsigned int val2), \
+ TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_int,
+ TP_PROTO(unsigned long long value1, int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%llu %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_int, name, \
+ TP_PROTO(unsigned long long val1, int val2), \
+ TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull,
+ TP_PROTO(unsigned long long value1, unsigned long long value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned long long, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%llu %llu", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_ull, name, \
+ TP_PROTO(unsigned long long val1, unsigned long long val2), \
+ TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint,
+ TP_PROTO(unsigned long long value1,
+ unsigned long long value2, unsigned int value3),
+ TP_ARGS(value1, value2, value3),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned long long, value2)
+ __field(unsigned int, value3)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %llu %u",
+ __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_ull_uint, name, \
+ TP_PROTO(unsigned long long val1, \
+ unsigned long long val2, unsigned int val3), \
+ TP_ARGS(val1, val2, val3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint,
+ TP_PROTO(unsigned long long value1,
+ unsigned int value2, unsigned int value3),
+ TP_ARGS(value1, value2, value3),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned int, value2)
+ __field(unsigned int, value3)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %u %u", __entry->value1,
+ __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_uint_uint, name, \
+ TP_PROTO(unsigned long long val1, \
+ unsigned int val2, unsigned int val3), \
+ TP_ARGS(val1, val2, val3))
+
+DECLARE_EVENT_CLASS(ocfs2__uint_uint_uint,
+ TP_PROTO(unsigned int value1, unsigned int value2,
+ unsigned int value3),
+ TP_ARGS(value1, value2, value3),
+ TP_STRUCT__entry(
+ __field( unsigned int, value1 )
+ __field( unsigned int, value2 )
+ __field( unsigned int, value3 )
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%u %u %u", __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_UINT_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__uint_uint_uint, name, \
+ TP_PROTO(unsigned int value1, unsigned int value2, \
+ unsigned int value3), \
+ TP_ARGS(value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_ull,
+ TP_PROTO(unsigned long long value1,
+ unsigned long long value2, unsigned long long value3),
+ TP_ARGS(value1, value2, value3),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned long long, value2)
+ __field(unsigned long long, value3)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %llu %llu",
+ __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_ULL_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_ull_ull, name, \
+ TP_PROTO(unsigned long long value1, unsigned long long value2, \
+ unsigned long long value3), \
+ TP_ARGS(value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_int_int_int,
+ TP_PROTO(unsigned long long ull, int value1, int value2, int value3),
+ TP_ARGS(ull, value1, value2, value3),
+ TP_STRUCT__entry(
+ __field( unsigned long long, ull )
+ __field( int, value1 )
+ __field( int, value2 )
+ __field( int, value3 )
+ ),
+ TP_fast_assign(
+ __entry->ull = ull;
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %d %d %d",
+ __entry->ull, __entry->value1,
+ __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_int_int_int, name, \
+ TP_PROTO(unsigned long long ull, int value1, \
+ int value2, int value3), \
+ TP_ARGS(ull, value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint_uint,
+ TP_PROTO(unsigned long long ull, unsigned int value1,
+ unsigned int value2, unsigned int value3),
+ TP_ARGS(ull, value1, value2, value3),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ull)
+ __field(unsigned int, value1)
+ __field(unsigned int, value2)
+ __field(unsigned int, value3)
+ ),
+ TP_fast_assign(
+ __entry->ull = ull;
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %u %u %u",
+ __entry->ull, __entry->value1,
+ __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_uint_uint_uint, name, \
+ TP_PROTO(unsigned long long ull, unsigned int value1, \
+ unsigned int value2, unsigned int value3), \
+ TP_ARGS(ull, value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint_uint,
+ TP_PROTO(unsigned long long value1, unsigned long long value2,
+ unsigned int value3, unsigned int value4),
+ TP_ARGS(value1, value2, value3, value4),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned long long, value2)
+ __field(unsigned int, value3)
+ __field(unsigned int, value4)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ __entry->value4 = value4;
+ ),
+ TP_printk("%llu %llu %u %u",
+ __entry->value1, __entry->value2,
+ __entry->value3, __entry->value4)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_ull_uint_uint, name, \
+ TP_PROTO(unsigned long long ull, unsigned long long ull1, \
+ unsigned int value2, unsigned int value3), \
+ TP_ARGS(ull, ull1, value2, value3))
+
+/* Trace events for fs/ocfs2/alloc.c. */
+DECLARE_EVENT_CLASS(ocfs2__btree_ops,
+ TP_PROTO(unsigned long long owner,\
+ unsigned int value1, unsigned int value2),
+ TP_ARGS(owner, value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(unsigned int, value1)
+ __field(unsigned int, value2)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%llu %u %u",
+ __entry->owner, __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_BTREE_EVENT(name) \
+DEFINE_EVENT(ocfs2__btree_ops, name, \
+ TP_PROTO(unsigned long long owner, \
+ unsigned int value1, unsigned int value2), \
+ TP_ARGS(owner, value1, value2))
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_adjust_rightmost_branch);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_rotate_tree_right);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_append_rec_to_path);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_insert_extent_start);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_add_clusters_in_btree);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_num_free_extents);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_complete_edge_insert);
+
+TRACE_EVENT(ocfs2_grow_tree,
+ TP_PROTO(unsigned long long owner, int depth),
+ TP_ARGS(owner, depth),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(int, depth)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->depth = depth;
+ ),
+ TP_printk("%llu %d", __entry->owner, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_rotate_subtree,
+ TP_PROTO(int subtree_root, unsigned long long blkno,
+ int depth),
+ TP_ARGS(subtree_root, blkno, depth),
+ TP_STRUCT__entry(
+ __field(int, subtree_root)
+ __field(unsigned long long, blkno)
+ __field(int, depth)
+ ),
+ TP_fast_assign(
+ __entry->subtree_root = subtree_root;
+ __entry->blkno = blkno;
+ __entry->depth = depth;
+ ),
+ TP_printk("%d %llu %d", __entry->subtree_root,
+ __entry->blkno, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_insert_extent,
+ TP_PROTO(unsigned int ins_appending, unsigned int ins_contig,
+ int ins_contig_index, int free_records, int ins_tree_depth),
+ TP_ARGS(ins_appending, ins_contig, ins_contig_index, free_records,
+ ins_tree_depth),
+ TP_STRUCT__entry(
+ __field(unsigned int, ins_appending)
+ __field(unsigned int, ins_contig)
+ __field(int, ins_contig_index)
+ __field(int, free_records)
+ __field(int, ins_tree_depth)
+ ),
+ TP_fast_assign(
+ __entry->ins_appending = ins_appending;
+ __entry->ins_contig = ins_contig;
+ __entry->ins_contig_index = ins_contig_index;
+ __entry->free_records = free_records;
+ __entry->ins_tree_depth = ins_tree_depth;
+ ),
+ TP_printk("%u %u %d %d %d",
+ __entry->ins_appending, __entry->ins_contig,
+ __entry->ins_contig_index, __entry->free_records,
+ __entry->ins_tree_depth)
+);
+
+TRACE_EVENT(ocfs2_split_extent,
+ TP_PROTO(int split_index, unsigned int c_contig_type,
+ unsigned int c_has_empty_extent,
+ unsigned int c_split_covers_rec),
+ TP_ARGS(split_index, c_contig_type,
+ c_has_empty_extent, c_split_covers_rec),
+ TP_STRUCT__entry(
+ __field(int, split_index)
+ __field(unsigned int, c_contig_type)
+ __field(unsigned int, c_has_empty_extent)
+ __field(unsigned int, c_split_covers_rec)
+ ),
+ TP_fast_assign(
+ __entry->split_index = split_index;
+ __entry->c_contig_type = c_contig_type;
+ __entry->c_has_empty_extent = c_has_empty_extent;
+ __entry->c_split_covers_rec = c_split_covers_rec;
+ ),
+ TP_printk("%d %u %u %u", __entry->split_index, __entry->c_contig_type,
+ __entry->c_has_empty_extent, __entry->c_split_covers_rec)
+);
+
+TRACE_EVENT(ocfs2_remove_extent,
+ TP_PROTO(unsigned long long owner, unsigned int cpos,
+ unsigned int len, int index,
+ unsigned int e_cpos, unsigned int clusters),
+ TP_ARGS(owner, cpos, len, index, e_cpos, clusters),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(unsigned int, cpos)
+ __field(unsigned int, len)
+ __field(int, index)
+ __field(unsigned int, e_cpos)
+ __field(unsigned int, clusters)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->cpos = cpos;
+ __entry->len = len;
+ __entry->index = index;
+ __entry->e_cpos = e_cpos;
+ __entry->clusters = clusters;
+ ),
+ TP_printk("%llu %u %u %d %u %u",
+ __entry->owner, __entry->cpos, __entry->len, __entry->index,
+ __entry->e_cpos, __entry->clusters)
+);
+
+TRACE_EVENT(ocfs2_commit_truncate,
+ TP_PROTO(unsigned long long ino, unsigned int new_cpos,
+ unsigned int clusters, unsigned int depth),
+ TP_ARGS(ino, new_cpos, clusters, depth),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, new_cpos)
+ __field(unsigned int, clusters)
+ __field(unsigned int, depth)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->new_cpos = new_cpos;
+ __entry->clusters = clusters;
+ __entry->depth = depth;
+ ),
+ TP_printk("%llu %u %u %u",
+ __entry->ino, __entry->new_cpos,
+ __entry->clusters, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_validate_extent_block,
+ TP_PROTO(unsigned long long blkno),
+ TP_ARGS(blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%llu ", __entry->blkno)
+);
+
+TRACE_EVENT(ocfs2_rotate_leaf,
+ TP_PROTO(unsigned int insert_cpos, int insert_index,
+ int has_empty, int next_free,
+ unsigned int l_count),
+ TP_ARGS(insert_cpos, insert_index, has_empty,
+ next_free, l_count),
+ TP_STRUCT__entry(
+ __field(unsigned int, insert_cpos)
+ __field(int, insert_index)
+ __field(int, has_empty)
+ __field(int, next_free)
+ __field(unsigned int, l_count)
+ ),
+ TP_fast_assign(
+ __entry->insert_cpos = insert_cpos;
+ __entry->insert_index = insert_index;
+ __entry->has_empty = has_empty;
+ __entry->next_free = next_free;
+ __entry->l_count = l_count;
+ ),
+ TP_printk("%u %d %d %d %u", __entry->insert_cpos,
+ __entry->insert_index, __entry->has_empty,
+ __entry->next_free, __entry->l_count)
+);
+
+TRACE_EVENT(ocfs2_add_clusters_in_btree_ret,
+ TP_PROTO(int status, int reason, int err),
+ TP_ARGS(status, reason, err),
+ TP_STRUCT__entry(
+ __field(int, status)
+ __field(int, reason)
+ __field(int, err)
+ ),
+ TP_fast_assign(
+ __entry->status = status;
+ __entry->reason = reason;
+ __entry->err = err;
+ ),
+ TP_printk("%d %d %d", __entry->status,
+ __entry->reason, __entry->err)
+);
+
+TRACE_EVENT(ocfs2_mark_extent_written,
+ TP_PROTO(unsigned long long owner, unsigned int cpos,
+ unsigned int len, unsigned int phys),
+ TP_ARGS(owner, cpos, len, phys),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(unsigned int, cpos)
+ __field(unsigned int, len)
+ __field(unsigned int, phys)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->cpos = cpos;
+ __entry->len = len;
+ __entry->phys = phys;
+ ),
+ TP_printk("%llu %u %u %u",
+ __entry->owner, __entry->cpos,
+ __entry->len, __entry->phys)
+);
+
+DECLARE_EVENT_CLASS(ocfs2__truncate_log_ops,
+ TP_PROTO(unsigned long long blkno, int index,
+ unsigned int start, unsigned int num),
+ TP_ARGS(blkno, index, start, num),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ __field(int, index)
+ __field(unsigned int, start)
+ __field(unsigned int, num)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ __entry->index = index;
+ __entry->start = start;
+ __entry->num = num;
+ ),
+ TP_printk("%llu %d %u %u",
+ __entry->blkno, __entry->index,
+ __entry->start, __entry->num)
+);
+
+#define DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(name) \
+DEFINE_EVENT(ocfs2__truncate_log_ops, name, \
+ TP_PROTO(unsigned long long blkno, int index, \
+ unsigned int start, unsigned int num), \
+ TP_ARGS(blkno, index, start, num))
+
+DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_truncate_log_append);
+
+DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_replay_truncate_records);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_flush_truncate_log);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_begin_truncate_log_recovery);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_truncate_log_recovery_num);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_complete_truncate_log_recovery);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_free_cached_blocks);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_cache_cluster_dealloc);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_run_deallocs);
+
+TRACE_EVENT(ocfs2_cache_block_dealloc,
+ TP_PROTO(int type, int slot, unsigned long long suballoc,
+ unsigned long long blkno, unsigned int bit),
+ TP_ARGS(type, slot, suballoc, blkno, bit),
+ TP_STRUCT__entry(
+ __field(int, type)
+ __field(int, slot)
+ __field(unsigned long long, suballoc)
+ __field(unsigned long long, blkno)
+ __field(unsigned int, bit)
+ ),
+ TP_fast_assign(
+ __entry->type = type;
+ __entry->slot = slot;
+ __entry->suballoc = suballoc;
+ __entry->blkno = blkno;
+ __entry->bit = bit;
+ ),
+ TP_printk("%d %d %llu %llu %u",
+ __entry->type, __entry->slot, __entry->suballoc,
+ __entry->blkno, __entry->bit)
+);
+
+TRACE_EVENT(ocfs2_trim_extent,
+ TP_PROTO(struct super_block *sb, unsigned long long blk,
+ unsigned long long count),
+ TP_ARGS(sb, blk, count),
+ TP_STRUCT__entry(
+ __field(int, dev_major)
+ __field(int, dev_minor)
+ __field(unsigned long long, blk)
+ __field(__u64, count)
+ ),
+ TP_fast_assign(
+ __entry->dev_major = MAJOR(sb->s_dev);
+ __entry->dev_minor = MINOR(sb->s_dev);
+ __entry->blk = blk;
+ __entry->count = count;
+ ),
+ TP_printk("%d %d %llu %llu",
+ __entry->dev_major, __entry->dev_minor,
+ __entry->blk, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
+
+/* End of trace events for fs/ocfs2/alloc.c. */
+
+/* Trace events for fs/ocfs2/localalloc.c. */
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_la_set_sizes);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_alloc_should_use_local);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_load_local_alloc);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_begin_local_alloc_recovery);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_reserve_local_alloc_bits);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_local_alloc_count_bits);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits_search_bitmap);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_sync_local_to_main);
+
+TRACE_EVENT(ocfs2_sync_local_to_main_free,
+ TP_PROTO(int count, int bit, unsigned long long start_blk,
+ unsigned long long blkno),
+ TP_ARGS(count, bit, start_blk, blkno),
+ TP_STRUCT__entry(
+ __field(int, count)
+ __field(int, bit)
+ __field(unsigned long long, start_blk)
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->count = count;
+ __entry->bit = bit;
+ __entry->start_blk = start_blk;
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%d %d %llu %llu",
+ __entry->count, __entry->bit, __entry->start_blk,
+ __entry->blkno)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_new_window);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result);
+
+/* End of trace events for fs/ocfs2/localalloc.c. */
+
+/* Trace events for fs/ocfs2/resize.c. */
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_update_last_group_and_inode);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_group_extend);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_group_add);
+
+/* End of trace events for fs/ocfs2/resize.c. */
+
+/* Trace events for fs/ocfs2/suballoc.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_group_descriptor);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_contig);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_discontig);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_block_group_alloc);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_nospc);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_no_new_group);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_reserve_new_inode_new_group);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_set_bits);
+
+TRACE_EVENT(ocfs2_relink_block_group,
+ TP_PROTO(unsigned long long i_blkno, unsigned int chain,
+ unsigned long long bg_blkno,
+ unsigned long long prev_blkno),
+ TP_ARGS(i_blkno, chain, bg_blkno, prev_blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, i_blkno)
+ __field(unsigned int, chain)
+ __field(unsigned long long, bg_blkno)
+ __field(unsigned long long, prev_blkno)
+ ),
+ TP_fast_assign(
+ __entry->i_blkno = i_blkno;
+ __entry->chain = chain;
+ __entry->bg_blkno = bg_blkno;
+ __entry->prev_blkno = prev_blkno;
+ ),
+ TP_printk("%llu %u %llu %llu",
+ __entry->i_blkno, __entry->chain, __entry->bg_blkno,
+ __entry->prev_blkno)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_cluster_group_search_wrong_max_bits);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cluster_group_search_max_block);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_block_group_search_max_block);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_search_chain_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_succ);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_end);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_claim_suballoc_bits);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_claim_new_inode_at_loc);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_clear_bits);
+
+TRACE_EVENT(ocfs2_free_suballoc_bits,
+ TP_PROTO(unsigned long long inode, unsigned long long group,
+ unsigned int start_bit, unsigned int count),
+ TP_ARGS(inode, group, start_bit, count),
+ TP_STRUCT__entry(
+ __field(unsigned long long, inode)
+ __field(unsigned long long, group)
+ __field(unsigned int, start_bit)
+ __field(unsigned int, count)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->group = group;
+ __entry->start_bit = start_bit;
+ __entry->count = count;
+ ),
+ TP_printk("%llu %llu %u %u", __entry->inode, __entry->group,
+ __entry->start_bit, __entry->count)
+);
+
+TRACE_EVENT(ocfs2_free_clusters,
+ TP_PROTO(unsigned long long bg_blkno, unsigned long long start_blk,
+ unsigned int start_bit, unsigned int count),
+ TP_ARGS(bg_blkno, start_blk, start_bit, count),
+ TP_STRUCT__entry(
+ __field(unsigned long long, bg_blkno)
+ __field(unsigned long long, start_blk)
+ __field(unsigned int, start_bit)
+ __field(unsigned int, count)
+ ),
+ TP_fast_assign(
+ __entry->bg_blkno = bg_blkno;
+ __entry->start_blk = start_blk;
+ __entry->start_bit = start_bit;
+ __entry->count = count;
+ ),
+ TP_printk("%llu %llu %u %u", __entry->bg_blkno, __entry->start_blk,
+ __entry->start_bit, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_get_suballoc_slot_bit);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_test_suballoc_bit);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_test_inode_bit);
+
+/* End of trace events for fs/ocfs2/suballoc.c. */
+
+/* Trace events for fs/ocfs2/refcounttree.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_refcount_block);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_refcount_trees);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree_blkno);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_change_refcount_rec);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_expand_inline_ref_root);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_divide_leaf_refcount_block);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_new_leaf_refcount_block);
+
+DECLARE_EVENT_CLASS(ocfs2__refcount_tree_ops,
+ TP_PROTO(unsigned long long blkno, int index,
+ unsigned long long cpos,
+ unsigned int clusters, unsigned int refcount),
+ TP_ARGS(blkno, index, cpos, clusters, refcount),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ __field(int, index)
+ __field(unsigned long long, cpos)
+ __field(unsigned int, clusters)
+ __field(unsigned int, refcount)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ __entry->index = index;
+ __entry->cpos = cpos;
+ __entry->clusters = clusters;
+ __entry->refcount = refcount;
+ ),
+ TP_printk("%llu %d %llu %u %u", __entry->blkno, __entry->index,
+ __entry->cpos, __entry->clusters, __entry->refcount)
+);
+
+#define DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(name) \
+DEFINE_EVENT(ocfs2__refcount_tree_ops, name, \
+ TP_PROTO(unsigned long long blkno, int index, \
+ unsigned long long cpos, \
+ unsigned int count, unsigned int refcount), \
+ TP_ARGS(blkno, index, cpos, count, refcount))
+
+DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_insert_refcount_rec);
+
+TRACE_EVENT(ocfs2_split_refcount_rec,
+ TP_PROTO(unsigned long long cpos,
+ unsigned int clusters, unsigned int refcount,
+ unsigned long long split_cpos,
+ unsigned int split_clusters, unsigned int split_refcount),
+ TP_ARGS(cpos, clusters, refcount,
+ split_cpos, split_clusters, split_refcount),
+ TP_STRUCT__entry(
+ __field(unsigned long long, cpos)
+ __field(unsigned int, clusters)
+ __field(unsigned int, refcount)
+ __field(unsigned long long, split_cpos)
+ __field(unsigned int, split_clusters)
+ __field(unsigned int, split_refcount)
+ ),
+ TP_fast_assign(
+ __entry->cpos = cpos;
+ __entry->clusters = clusters;
+ __entry->refcount = refcount;
+ __entry->split_cpos = split_cpos;
+ __entry->split_clusters = split_clusters;
+ __entry->split_refcount = split_refcount;
+ ),
+ TP_printk("%llu %u %u %llu %u %u",
+ __entry->cpos, __entry->clusters, __entry->refcount,
+ __entry->split_cpos, __entry->split_clusters,
+ __entry->split_refcount)
+);
+
+DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_split_refcount_rec_insert);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_increase_refcount_begin);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_change);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_increase_refcount_insert);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_split);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_remove_refcount_extent);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_restore_refcount_block);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_decrease_refcount_rec);
+
+TRACE_EVENT(ocfs2_decrease_refcount,
+ TP_PROTO(unsigned long long owner,
+ unsigned long long cpos,
+ unsigned int len, int delete),
+ TP_ARGS(owner, cpos, len, delete),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(unsigned long long, cpos)
+ __field(unsigned int, len)
+ __field(int, delete)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->cpos = cpos;
+ __entry->len = len;
+ __entry->delete = delete;
+ ),
+ TP_printk("%llu %llu %u %d",
+ __entry->owner, __entry->cpos, __entry->len, __entry->delete)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_mark_extent_refcounted);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_calc_refcount_meta_credits);
+
+TRACE_EVENT(ocfs2_calc_refcount_meta_credits_iterate,
+ TP_PROTO(int recs_add, unsigned long long cpos,
+ unsigned int clusters, unsigned long long r_cpos,
+ unsigned int r_clusters, unsigned int refcount, int index),
+ TP_ARGS(recs_add, cpos, clusters, r_cpos, r_clusters, refcount, index),
+ TP_STRUCT__entry(
+ __field(int, recs_add)
+ __field(unsigned long long, cpos)
+ __field(unsigned int, clusters)
+ __field(unsigned long long, r_cpos)
+ __field(unsigned int, r_clusters)
+ __field(unsigned int, refcount)
+ __field(int, index)
+ ),
+ TP_fast_assign(
+ __entry->recs_add = recs_add;
+ __entry->cpos = cpos;
+ __entry->clusters = clusters;
+ __entry->r_cpos = r_cpos;
+ __entry->r_clusters = r_clusters;
+ __entry->refcount = refcount;
+ __entry->index = index;
+ ),
+ TP_printk("%d %llu %u %llu %u %u %d",
+ __entry->recs_add, __entry->cpos, __entry->clusters,
+ __entry->r_cpos, __entry->r_clusters,
+ __entry->refcount, __entry->index)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_add_refcount_flag);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_prepare_refcount_change_for_del);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_lock_refcount_allocators);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_page);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_jbd);
+
+TRACE_EVENT(ocfs2_clear_ext_refcount,
+ TP_PROTO(unsigned long long ino, unsigned int cpos,
+ unsigned int len, unsigned int p_cluster,
+ unsigned int ext_flags),
+ TP_ARGS(ino, cpos, len, p_cluster, ext_flags),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, cpos)
+ __field(unsigned int, len)
+ __field(unsigned int, p_cluster)
+ __field(unsigned int, ext_flags)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->cpos = cpos;
+ __entry->len = len;
+ __entry->p_cluster = p_cluster;
+ __entry->ext_flags = ext_flags;
+ ),
+ TP_printk("%llu %u %u %u %u",
+ __entry->ino, __entry->cpos, __entry->len,
+ __entry->p_cluster, __entry->ext_flags)
+);
+
+TRACE_EVENT(ocfs2_replace_clusters,
+ TP_PROTO(unsigned long long ino, unsigned int cpos,
+ unsigned int old, unsigned int new, unsigned int len,
+ unsigned int ext_flags),
+ TP_ARGS(ino, cpos, old, new, len, ext_flags),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, cpos)
+ __field(unsigned int, old)
+ __field(unsigned int, new)
+ __field(unsigned int, len)
+ __field(unsigned int, ext_flags)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->cpos = cpos;
+ __entry->old = old;
+ __entry->new = new;
+ __entry->len = len;
+ __entry->ext_flags = ext_flags;
+ ),
+ TP_printk("%llu %u %u %u %u %u",
+ __entry->ino, __entry->cpos, __entry->old, __entry->new,
+ __entry->len, __entry->ext_flags)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_make_clusters_writable);
+
+TRACE_EVENT(ocfs2_refcount_cow_hunk,
+ TP_PROTO(unsigned long long ino, unsigned int cpos,
+ unsigned int write_len, unsigned int max_cpos,
+ unsigned int cow_start, unsigned int cow_len),
+ TP_ARGS(ino, cpos, write_len, max_cpos, cow_start, cow_len),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, cpos)
+ __field(unsigned int, write_len)
+ __field(unsigned int, max_cpos)
+ __field(unsigned int, cow_start)
+ __field(unsigned int, cow_len)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->cpos = cpos;
+ __entry->write_len = write_len;
+ __entry->max_cpos = max_cpos;
+ __entry->cow_start = cow_start;
+ __entry->cow_len = cow_len;
+ ),
+ TP_printk("%llu %u %u %u %u %u",
+ __entry->ino, __entry->cpos, __entry->write_len,
+ __entry->max_cpos, __entry->cow_start, __entry->cow_len)
+);
+
+/* End of trace events for fs/ocfs2/refcounttree.c. */
+
+/* Trace events for fs/ocfs2/aops.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__get_block,
+ TP_PROTO(unsigned long long ino, unsigned long long iblock,
+ void *bh_result, int create),
+ TP_ARGS(ino, iblock, bh_result, create),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned long long, iblock)
+ __field(void *, bh_result)
+ __field(int, create)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->iblock = iblock;
+ __entry->bh_result = bh_result;
+ __entry->create = create;
+ ),
+ TP_printk("%llu %llu %p %d",
+ __entry->ino, __entry->iblock,
+ __entry->bh_result, __entry->create)
+);
+
+#define DEFINE_OCFS2_GET_BLOCK_EVENT(name) \
+DEFINE_EVENT(ocfs2__get_block, name, \
+ TP_PROTO(unsigned long long ino, unsigned long long iblock, \
+ void *bh_result, int create), \
+ TP_ARGS(ino, iblock, bh_result, create))
+
+DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_symlink_get_block);
+
+DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_get_block);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap);
+
+TRACE_EVENT(ocfs2_try_to_write_inline_data,
+ TP_PROTO(unsigned long long ino, unsigned int len,
+ unsigned long long pos, unsigned int flags),
+ TP_ARGS(ino, len, pos, flags),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, len)
+ __field(unsigned long long, pos)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->len = len;
+ __entry->pos = pos;
+ __entry->flags = flags;
+ ),
+ TP_printk("%llu %u %llu 0x%x",
+ __entry->ino, __entry->len, __entry->pos, __entry->flags)
+);
+
+TRACE_EVENT(ocfs2_write_begin_nolock,
+ TP_PROTO(unsigned long long ino,
+ long long i_size, unsigned int i_clusters,
+ unsigned long long pos, unsigned int len,
+ unsigned int flags, void *page,
+ unsigned int clusters, unsigned int extents_to_split),
+ TP_ARGS(ino, i_size, i_clusters, pos, len, flags,
+ page, clusters, extents_to_split),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(long long, i_size)
+ __field(unsigned int, i_clusters)
+ __field(unsigned long long, pos)
+ __field(unsigned int, len)
+ __field(unsigned int, flags)
+ __field(void *, page)
+ __field(unsigned int, clusters)
+ __field(unsigned int, extents_to_split)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->i_size = i_size;
+ __entry->i_clusters = i_clusters;
+ __entry->pos = pos;
+ __entry->len = len;
+ __entry->flags = flags;
+ __entry->page = page;
+ __entry->clusters = clusters;
+ __entry->extents_to_split = extents_to_split;
+ ),
+ TP_printk("%llu %lld %u %llu %u %u %p %u %u",
+ __entry->ino, __entry->i_size, __entry->i_clusters,
+ __entry->pos, __entry->len,
+ __entry->flags, __entry->page, __entry->clusters,
+ __entry->extents_to_split)
+);
+
+TRACE_EVENT(ocfs2_write_end_inline,
+ TP_PROTO(unsigned long long ino,
+ unsigned long long pos, unsigned int copied,
+ unsigned int id_count, unsigned int features),
+ TP_ARGS(ino, pos, copied, id_count, features),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned long long, pos)
+ __field(unsigned int, copied)
+ __field(unsigned int, id_count)
+ __field(unsigned int, features)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->pos = pos;
+ __entry->copied = copied;
+ __entry->id_count = id_count;
+ __entry->features = features;
+ ),
+ TP_printk("%llu %llu %u %u %u",
+ __entry->ino, __entry->pos, __entry->copied,
+ __entry->id_count, __entry->features)
+);
+
+/* End of trace events for fs/ocfs2/aops.c. */
+
+/* Trace events for fs/ocfs2/mmap.c. */
+
+TRACE_EVENT(ocfs2_fault,
+ TP_PROTO(unsigned long long ino,
+ void *area, void *page, unsigned long pgoff),
+ TP_ARGS(ino, area, page, pgoff),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(void *, area)
+ __field(void *, page)
+ __field(unsigned long, pgoff)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->area = area;
+ __entry->page = page;
+ __entry->pgoff = pgoff;
+ ),
+ TP_printk("%llu %p %p %lu",
+ __entry->ino, __entry->area, __entry->page, __entry->pgoff)
+);
+
+/* End of trace events for fs/ocfs2/mmap.c. */
+
+/* Trace events for fs/ocfs2/file.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__file_ops,
+ TP_PROTO(void *inode, void *file, void *dentry,
+ unsigned long long ino,
+ unsigned int d_len, const unsigned char *d_name,
+ unsigned long long para),
+ TP_ARGS(inode, file, dentry, ino, d_len, d_name, para),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(void *, file)
+ __field(void *, dentry)
+ __field(unsigned long long, ino)
+ __field(unsigned int, d_len)
+ __string(d_name, d_name)
+ __field(unsigned long long, para)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->file = file;
+ __entry->dentry = dentry;
+ __entry->ino = ino;
+ __entry->d_len = d_len;
+ __assign_str(d_name, d_name);
+ __entry->para = para;
+ ),
+ TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file,
+ __entry->dentry, __entry->ino, __entry->para,
+ __entry->d_len, __get_str(d_name))
+);
+
+#define DEFINE_OCFS2_FILE_OPS(name) \
+DEFINE_EVENT(ocfs2__file_ops, name, \
+TP_PROTO(void *inode, void *file, void *dentry, \
+ unsigned long long ino, \
+ unsigned int d_len, const unsigned char *d_name, \
+ unsigned long long mode), \
+ TP_ARGS(inode, file, dentry, ino, d_len, d_name, mode))
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_open);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_release);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error);
+
+TRACE_EVENT(ocfs2_extend_allocation,
+ TP_PROTO(unsigned long long ip_blkno, unsigned long long size,
+ unsigned int clusters, unsigned int clusters_to_add,
+ int why, int restart_func),
+ TP_ARGS(ip_blkno, size, clusters, clusters_to_add, why, restart_func),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ip_blkno)
+ __field(unsigned long long, size)
+ __field(unsigned int, clusters)
+ __field(unsigned int, clusters_to_add)
+ __field(int, why)
+ __field(int, restart_func)
+ ),
+ TP_fast_assign(
+ __entry->ip_blkno = ip_blkno;
+ __entry->size = size;
+ __entry->clusters = clusters;
+ __entry->clusters_to_add = clusters_to_add;
+ __entry->why = why;
+ __entry->restart_func = restart_func;
+ ),
+ TP_printk("%llu %llu %u %u %d %d",
+ __entry->ip_blkno, __entry->size, __entry->clusters,
+ __entry->clusters_to_add, __entry->why, __entry->restart_func)
+);
+
+TRACE_EVENT(ocfs2_extend_allocation_end,
+ TP_PROTO(unsigned long long ino,
+ unsigned int di_clusters, unsigned long long di_size,
+ unsigned int ip_clusters, unsigned long long i_size),
+ TP_ARGS(ino, di_clusters, di_size, ip_clusters, i_size),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, di_clusters)
+ __field(unsigned long long, di_size)
+ __field(unsigned int, ip_clusters)
+ __field(unsigned long long, i_size)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->di_clusters = di_clusters;
+ __entry->di_size = di_size;
+ __entry->ip_clusters = ip_clusters;
+ __entry->i_size = i_size;
+ ),
+ TP_printk("%llu %u %llu %u %llu", __entry->ino, __entry->di_clusters,
+ __entry->di_size, __entry->ip_clusters, __entry->i_size)
+);
+
+TRACE_EVENT(ocfs2_write_zero_page,
+ TP_PROTO(unsigned long long ino,
+ unsigned long long abs_from, unsigned long long abs_to,
+ unsigned long index, unsigned int zero_from,
+ unsigned int zero_to),
+ TP_ARGS(ino, abs_from, abs_to, index, zero_from, zero_to),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned long long, abs_from)
+ __field(unsigned long long, abs_to)
+ __field(unsigned long, index)
+ __field(unsigned int, zero_from)
+ __field(unsigned int, zero_to)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->abs_from = abs_from;
+ __entry->abs_to = abs_to;
+ __entry->index = index;
+ __entry->zero_from = zero_from;
+ __entry->zero_to = zero_to;
+ ),
+ TP_printk("%llu %llu %llu %lu %u %u", __entry->ino,
+ __entry->abs_from, __entry->abs_to,
+ __entry->index, __entry->zero_from, __entry->zero_to)
+);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend_range);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend);
+
+TRACE_EVENT(ocfs2_setattr,
+ TP_PROTO(void *inode, void *dentry,
+ unsigned long long ino,
+ unsigned int d_len, const unsigned char *d_name,
+ unsigned int ia_valid, unsigned int ia_mode,
+ unsigned int ia_uid, unsigned int ia_gid),
+ TP_ARGS(inode, dentry, ino, d_len, d_name,
+ ia_valid, ia_mode, ia_uid, ia_gid),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(void *, dentry)
+ __field(unsigned long long, ino)
+ __field(unsigned int, d_len)
+ __string(d_name, d_name)
+ __field(unsigned int, ia_valid)
+ __field(unsigned int, ia_mode)
+ __field(unsigned int, ia_uid)
+ __field(unsigned int, ia_gid)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->dentry = dentry;
+ __entry->ino = ino;
+ __entry->d_len = d_len;
+ __assign_str(d_name, d_name);
+ __entry->ia_valid = ia_valid;
+ __entry->ia_mode = ia_mode;
+ __entry->ia_uid = ia_uid;
+ __entry->ia_gid = ia_gid;
+ ),
+ TP_printk("%p %p %llu %.*s %u %u %u %u", __entry->inode,
+ __entry->dentry, __entry->ino, __entry->d_len,
+ __get_str(d_name), __entry->ia_valid, __entry->ia_mode,
+ __entry->ia_uid, __entry->ia_gid)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_write_remove_suid);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_partial_clusters);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range1);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range2);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
+
+TRACE_EVENT(ocfs2_prepare_inode_for_write,
+ TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
+ int appending, unsigned long count,
+ int *direct_io, int *has_refcount),
+ TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned long long, saved_pos)
+ __field(int, appending)
+ __field(unsigned long, count)
+ __field(int, direct_io)
+ __field(int, has_refcount)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->saved_pos = saved_pos;
+ __entry->appending = appending;
+ __entry->count = count;
+ __entry->direct_io = direct_io ? *direct_io : -1;
+ __entry->has_refcount = has_refcount ? *has_refcount : -1;
+ ),
+ TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
+ __entry->saved_pos, __entry->appending, __entry->count,
+ __entry->direct_io, __entry->has_refcount)
+);
+
+DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
+
+/* End of trace events for fs/ocfs2/file.c. */
+
+/* Trace events for fs/ocfs2/inode.c. */
+
+TRACE_EVENT(ocfs2_iget_begin,
+ TP_PROTO(unsigned long long ino, unsigned int flags, int sysfile_type),
+ TP_ARGS(ino, flags, sysfile_type),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, flags)
+ __field(int, sysfile_type)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->flags = flags;
+ __entry->sysfile_type = sysfile_type;
+ ),
+ TP_printk("%llu %u %d", __entry->ino,
+ __entry->flags, __entry->sysfile_type)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_iget5_locked);
+
+TRACE_EVENT(ocfs2_iget_end,
+ TP_PROTO(void *inode, unsigned long long ino),
+ TP_ARGS(inode, ino),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(unsigned long long, ino)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->ino = ino;
+ ),
+ TP_printk("%p %llu", __entry->inode, __entry->ino)
+);
+
+TRACE_EVENT(ocfs2_find_actor,
+ TP_PROTO(void *inode, unsigned long long ino,
+ void *args, unsigned long long fi_blkno),
+ TP_ARGS(inode, ino, args, fi_blkno),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(unsigned long long, ino)
+ __field(void *, args)
+ __field(unsigned long long, fi_blkno)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->ino = ino;
+ __entry->args = args;
+ __entry->fi_blkno = fi_blkno;
+ ),
+ TP_printk("%p %llu %p %llu", __entry->inode, __entry->ino,
+ __entry->args, __entry->fi_blkno)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_populate_inode);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
+
+TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
+ TP_PROTO(void *task, void *dc_task, unsigned long long ino,
+ unsigned int flags),
+ TP_ARGS(task, dc_task, ino, flags),
+ TP_STRUCT__entry(
+ __field(void *, task)
+ __field(void *, dc_task)
+ __field(unsigned long long, ino)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->task = task;
+ __entry->dc_task = dc_task;
+ __entry->ino = ino;
+ __entry->flags = flags;
+ ),
+ TP_printk("%p %p %llu %u", __entry->task, __entry->dc_task,
+ __entry->ino, __entry->flags)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_query_inode_wipe_begin);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_query_inode_wipe_succ);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_query_inode_wipe_end);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_cleanup_delete_inode);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode);
+
+TRACE_EVENT(ocfs2_inode_revalidate,
+ TP_PROTO(void *inode, unsigned long long ino,
+ unsigned int flags),
+ TP_ARGS(inode, ino, flags),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(unsigned long long, ino)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->ino = ino;
+ __entry->flags = flags;
+ ),
+ TP_printk("%p %llu %u", __entry->inode, __entry->ino, __entry->flags)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_mark_inode_dirty);
+
+/* End of trace events for fs/ocfs2/inode.c. */
+
+/* Trace events for fs/ocfs2/extent_map.c. */
+
+TRACE_EVENT(ocfs2_read_virt_blocks,
+ TP_PROTO(void *inode, unsigned long long vblock, int nr,
+ void *bhs, unsigned int flags, void *validate),
+ TP_ARGS(inode, vblock, nr, bhs, flags, validate),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(unsigned long long, vblock)
+ __field(int, nr)
+ __field(void *, bhs)
+ __field(unsigned int, flags)
+ __field(void *, validate)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->vblock = vblock;
+ __entry->nr = nr;
+ __entry->bhs = bhs;
+ __entry->flags = flags;
+ __entry->validate = validate;
+ ),
+ TP_printk("%p %llu %d %p %x %p", __entry->inode, __entry->vblock,
+ __entry->nr, __entry->bhs, __entry->flags, __entry->validate)
+);
+
+/* End of trace events for fs/ocfs2/extent_map.c. */
+
+/* Trace events for fs/ocfs2/slot_map.c. */
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_refresh_slot_info);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers_block);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_find_slot);
+
+/* End of trace events for fs/ocfs2/slot_map.c. */
+
+/* Trace events for fs/ocfs2/heartbeat.c. */
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down);
+
+/* End of trace events for fs/ocfs2/heartbeat.c. */
+
+/* Trace events for fs/ocfs2/super.c. */
+
+TRACE_EVENT(ocfs2_remount,
+ TP_PROTO(unsigned long s_flags, unsigned long osb_flags, int flags),
+ TP_ARGS(s_flags, osb_flags, flags),
+ TP_STRUCT__entry(
+ __field(unsigned long, s_flags)
+ __field(unsigned long, osb_flags)
+ __field(int, flags)
+ ),
+ TP_fast_assign(
+ __entry->s_flags = s_flags;
+ __entry->osb_flags = osb_flags;
+ __entry->flags = flags;
+ ),
+ TP_printk("%lu %lu %d", __entry->s_flags,
+ __entry->osb_flags, __entry->flags)
+);
+
+TRACE_EVENT(ocfs2_fill_super,
+ TP_PROTO(void *sb, void *data, int silent),
+ TP_ARGS(sb, data, silent),
+ TP_STRUCT__entry(
+ __field(void *, sb)
+ __field(void *, data)
+ __field(int, silent)
+ ),
+ TP_fast_assign(
+ __entry->sb = sb;
+ __entry->data = data;
+ __entry->silent = silent;
+ ),
+ TP_printk("%p %p %d", __entry->sb,
+ __entry->data, __entry->silent)
+);
+
+TRACE_EVENT(ocfs2_parse_options,
+ TP_PROTO(int is_remount, char *options),
+ TP_ARGS(is_remount, options),
+ TP_STRUCT__entry(
+ __field(int, is_remount)
+ __string(options, options)
+ ),
+ TP_fast_assign(
+ __entry->is_remount = is_remount;
+ __assign_str(options, options);
+ ),
+ TP_printk("%d %s", __entry->is_remount, __get_str(options))
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super);
+
+TRACE_EVENT(ocfs2_statfs,
+ TP_PROTO(void *sb, void *buf),
+ TP_ARGS(sb, buf),
+ TP_STRUCT__entry(
+ __field(void *, sb)
+ __field(void *, buf)
+ ),
+ TP_fast_assign(
+ __entry->sb = sb;
+ __entry->buf = buf;
+ ),
+ TP_printk("%p %p", __entry->sb, __entry->buf)
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_dismount_volume);
+
+TRACE_EVENT(ocfs2_initialize_super,
+ TP_PROTO(char *label, char *uuid_str, unsigned long long root_dir,
+ unsigned long long system_dir, int cluster_bits),
+ TP_ARGS(label, uuid_str, root_dir, system_dir, cluster_bits),
+ TP_STRUCT__entry(
+ __string(label, label)
+ __string(uuid_str, uuid_str)
+ __field(unsigned long long, root_dir)
+ __field(unsigned long long, system_dir)
+ __field(int, cluster_bits)
+ ),
+ TP_fast_assign(
+ __assign_str(label, label);
+ __assign_str(uuid_str, uuid_str);
+ __entry->root_dir = root_dir;
+ __entry->system_dir = system_dir;
+ __entry->cluster_bits = cluster_bits;
+ ),
+ TP_printk("%s %s %llu %llu %d", __get_str(label), __get_str(uuid_str),
+ __entry->root_dir, __entry->system_dir, __entry->cluster_bits)
+);
+
+/* End of trace events for fs/ocfs2/super.c. */
+
+/* Trace events for fs/ocfs2/xattr.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_xattr_block);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_xattr_extend_allocation);
+
+TRACE_EVENT(ocfs2_init_xattr_set_ctxt,
+ TP_PROTO(const char *name, int meta, int clusters, int credits),
+ TP_ARGS(name, meta, clusters, credits),
+ TP_STRUCT__entry(
+ __string(name, name)
+ __field(int, meta)
+ __field(int, clusters)
+ __field(int, credits)
+ ),
+ TP_fast_assign(
+ __assign_str(name, name);
+ __entry->meta = meta;
+ __entry->clusters = clusters;
+ __entry->credits = credits;
+ ),
+ TP_printk("%s %d %d %d", __get_str(name), __entry->meta,
+ __entry->clusters, __entry->credits)
+);
+
+DECLARE_EVENT_CLASS(ocfs2__xattr_find,
+ TP_PROTO(unsigned long long ino, const char *name, int name_index,
+ unsigned int hash, unsigned long long location,
+ int xe_index),
+ TP_ARGS(ino, name, name_index, hash, location, xe_index),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __string(name, name)
+ __field(int, name_index)
+ __field(unsigned int, hash)
+ __field(unsigned long long, location)
+ __field(int, xe_index)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __assign_str(name, name);
+ __entry->name_index = name_index;
+ __entry->hash = hash;
+ __entry->location = location;
+ __entry->xe_index = xe_index;
+ ),
+ TP_printk("%llu %s %d %u %llu %d", __entry->ino, __get_str(name),
+ __entry->name_index, __entry->hash, __entry->location,
+ __entry->xe_index)
+);
+
+#define DEFINE_OCFS2_XATTR_FIND_EVENT(name) \
+DEFINE_EVENT(ocfs2__xattr_find, name, \
+TP_PROTO(unsigned long long ino, const char *name, int name_index, \
+ unsigned int hash, unsigned long long bucket, \
+ int xe_index), \
+ TP_ARGS(ino, name, name_index, hash, bucket, xe_index))
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_bucket_find);
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find);
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find_rec);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_iterate_xattr_buckets);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_iterate_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cp_xattr_block_to_bucket_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cp_xattr_block_to_bucket_end);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block_begin);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_defrag_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_bucket_cross_cluster);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_divide_xattr_bucket_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_divide_xattr_bucket_move);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_cp_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_buckets);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_adjust_xattr_cross_cluster);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_add_new_xattr_cluster);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_insert);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_extend_xattr_bucket);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_add_new_xattr_bucket);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_xattr_bucket_value_truncate);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_rm_xattr_cluster);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_header);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_create_empty_xattr_block);
+
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_bucket);
+
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_index_block);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_xattr_bucket_value_refcount);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_reflink_xattr_buckets);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_rec);
+
+/* End of trace events for fs/ocfs2/xattr.c. */
+
+/* Trace events for fs/ocfs2/reservations.c. */
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_insert);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_begin);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_end);
+
+TRACE_EVENT(ocfs2_resv_find_window_begin,
+ TP_PROTO(unsigned int r_start, unsigned int r_end, unsigned int goal,
+ unsigned int wanted, int empty_root),
+ TP_ARGS(r_start, r_end, goal, wanted, empty_root),
+ TP_STRUCT__entry(
+ __field(unsigned int, r_start)
+ __field(unsigned int, r_end)
+ __field(unsigned int, goal)
+ __field(unsigned int, wanted)
+ __field(int, empty_root)
+ ),
+ TP_fast_assign(
+ __entry->r_start = r_start;
+ __entry->r_end = r_end;
+ __entry->goal = goal;
+ __entry->wanted = wanted;
+ __entry->empty_root = empty_root;
+ ),
+ TP_printk("%u %u %u %u %d", __entry->r_start, __entry->r_end,
+ __entry->goal, __entry->wanted, __entry->empty_root)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_find_window_prev);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_resv_find_window_next);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cannibalize_resv_begin);
+
+TRACE_EVENT(ocfs2_cannibalize_resv_end,
+ TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
+ unsigned int last_start, unsigned int last_len),
+ TP_ARGS(start, end, len, last_start, last_len),
+ TP_STRUCT__entry(
+ __field(unsigned int, start)
+ __field(unsigned int, end)
+ __field(unsigned int, len)
+ __field(unsigned int, last_start)
+ __field(unsigned int, last_len)
+ ),
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ __entry->len = len;
+ __entry->last_start = last_start;
+ __entry->last_len = last_len;
+ ),
+ TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
+ __entry->len, __entry->last_start, __entry->last_len)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_resv_bits);
+
+TRACE_EVENT(ocfs2_resmap_claimed_bits_begin,
+ TP_PROTO(unsigned int cstart, unsigned int cend, unsigned int clen,
+ unsigned int r_start, unsigned int r_end, unsigned int r_len,
+ unsigned int last_start, unsigned int last_len),
+ TP_ARGS(cstart, cend, clen, r_start, r_end,
+ r_len, last_start, last_len),
+ TP_STRUCT__entry(
+ __field(unsigned int, cstart)
+ __field(unsigned int, cend)
+ __field(unsigned int, clen)
+ __field(unsigned int, r_start)
+ __field(unsigned int, r_end)
+ __field(unsigned int, r_len)
+ __field(unsigned int, last_start)
+ __field(unsigned int, last_len)
+ ),
+ TP_fast_assign(
+ __entry->cstart = cstart;
+ __entry->cend = cend;
+ __entry->clen = clen;
+ __entry->r_start = r_start;
+ __entry->r_end = r_end;
+ __entry->r_len = r_len;
+ __entry->last_start = last_start;
+ __entry->last_len = last_len;
+ ),
+ TP_printk("%u %u %u %u %u %u %u %u",
+ __entry->cstart, __entry->cend, __entry->clen,
+ __entry->r_start, __entry->r_end, __entry->r_len,
+ __entry->last_start, __entry->last_len)
+);
+
+TRACE_EVENT(ocfs2_resmap_claimed_bits_end,
+ TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
+ unsigned int last_start, unsigned int last_len),
+ TP_ARGS(start, end, len, last_start, last_len),
+ TP_STRUCT__entry(
+ __field(unsigned int, start)
+ __field(unsigned int, end)
+ __field(unsigned int, len)
+ __field(unsigned int, last_start)
+ __field(unsigned int, last_len)
+ ),
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ __entry->len = len;
+ __entry->last_start = last_start;
+ __entry->last_len = last_len;
+ ),
+ TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
+ __entry->len, __entry->last_start, __entry->last_len)
+);
+
+/* End of trace events for fs/ocfs2/reservations.c. */
+
+/* Trace events for fs/ocfs2/quota_local.c. */
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_recover_local_quota_file);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_finish_quota_recovery);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(olq_set_dquot);
+
+/* End of trace events for fs/ocfs2/quota_local.c. */
+
+/* Trace events for fs/ocfs2/quota_global.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_quota_block);
+
+TRACE_EVENT(ocfs2_sync_dquot,
+ TP_PROTO(unsigned int dq_id, long long dqb_curspace,
+ long long spacechange, long long curinodes,
+ long long inodechange),
+ TP_ARGS(dq_id, dqb_curspace, spacechange, curinodes, inodechange),
+ TP_STRUCT__entry(
+ __field(unsigned int, dq_id)
+ __field(long long, dqb_curspace)
+ __field(long long, spacechange)
+ __field(long long, curinodes)
+ __field(long long, inodechange)
+ ),
+ TP_fast_assign(
+ __entry->dq_id = dq_id;
+ __entry->dqb_curspace = dqb_curspace;
+ __entry->spacechange = spacechange;
+ __entry->curinodes = curinodes;
+ __entry->inodechange = inodechange;
+ ),
+ TP_printk("%u %lld %lld %lld %lld", __entry->dq_id,
+ __entry->dqb_curspace, __entry->spacechange,
+ __entry->curinodes, __entry->inodechange)
+);
+
+TRACE_EVENT(ocfs2_sync_dquot_helper,
+ TP_PROTO(unsigned int dq_id, unsigned int dq_type, unsigned long type,
+ const char *s_id),
+ TP_ARGS(dq_id, dq_type, type, s_id),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, dq_id)
+ __field(unsigned int, dq_type)
+ __field(unsigned long, type)
+ __string(s_id, s_id)
+ ),
+ TP_fast_assign(
+ __entry->dq_id = dq_id;
+ __entry->dq_type = dq_type;
+ __entry->type = type;
+ __assign_str(s_id, s_id);
+ ),
+ TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type,
+ __entry->type, __get_str(s_id))
+);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_write_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
+
+/* End of trace events for fs/ocfs2/quota_global.c. */
+
+/* Trace events for fs/ocfs2/dir.c. */
+DEFINE_OCFS2_INT_EVENT(ocfs2_search_dirblock);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_dir_block);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_find_entry_el);
+
+TRACE_EVENT(ocfs2_dx_dir_search,
+ TP_PROTO(unsigned long long ino, int namelen, const char *name,
+ unsigned int major_hash, unsigned int minor_hash,
+ unsigned long long blkno),
+ TP_ARGS(ino, namelen, name, major_hash, minor_hash, blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(int, namelen)
+ __string(name, name)
+ __field(unsigned int, major_hash)
+ __field(unsigned int,minor_hash)
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->namelen = namelen;
+ __assign_str(name, name);
+ __entry->major_hash = major_hash;
+ __entry->minor_hash = minor_hash;
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%llu %.*s %u %u %llu", __entry->ino,
+ __entry->namelen, __get_str(name),
+ __entry->major_hash, __entry->minor_hash, __entry->blkno)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_dx_dir_search_leaf_info);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_delete_entry_dx);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_readdir);
+
+TRACE_EVENT(ocfs2_find_files_on_disk,
+ TP_PROTO(int namelen, const char *name, void *blkno,
+ unsigned long long dir),
+ TP_ARGS(namelen, name, blkno, dir),
+ TP_STRUCT__entry(
+ __field(int, namelen)
+ __string(name, name)
+ __field(void *, blkno)
+ __field(unsigned long long, dir)
+ ),
+ TP_fast_assign(
+ __entry->namelen = namelen;
+ __assign_str(name, name);
+ __entry->blkno = blkno;
+ __entry->dir = dir;
+ ),
+ TP_printk("%.*s %p %llu", __entry->namelen, __get_str(name),
+ __entry->blkno, __entry->dir)
+);
+
+TRACE_EVENT(ocfs2_check_dir_for_entry,
+ TP_PROTO(unsigned long long dir, int namelen, const char *name),
+ TP_ARGS(dir, namelen, name),
+ TP_STRUCT__entry(
+ __field(unsigned long long, dir)
+ __field(int, namelen)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->namelen = namelen;
+ __assign_str(name, name);
+ ),
+ TP_printk("%llu %.*s", __entry->dir,
+ __entry->namelen, __get_str(name))
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_dx_dir_attach_index);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_format_cluster);
+
+TRACE_EVENT(ocfs2_dx_dir_index_root_block,
+ TP_PROTO(unsigned long long dir,
+ unsigned int major_hash, unsigned int minor_hash,
+ int namelen, const char *name, unsigned int num_used),
+ TP_ARGS(dir, major_hash, minor_hash, namelen, name, num_used),
+ TP_STRUCT__entry(
+ __field(unsigned long long, dir)
+ __field(unsigned int, major_hash)
+ __field(unsigned int, minor_hash)
+ __field(int, namelen)
+ __string(name, name)
+ __field(unsigned int, num_used)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->major_hash = major_hash;
+ __entry->minor_hash = minor_hash;
+ __entry->namelen = namelen;
+ __assign_str(name, name);
+ __entry->num_used = num_used;
+ ),
+ TP_printk("%llu %x %x %.*s %u", __entry->dir,
+ __entry->major_hash, __entry->minor_hash,
+ __entry->namelen, __get_str(name), __entry->num_used)
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_extend_dir);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_rebalance);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_dx_dir_rebalance_split);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_prepare_dir_for_insert);
+
+/* End of trace events for fs/ocfs2/dir.c. */
+
+/* Trace events for fs/ocfs2/namei.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__dentry_ops,
+ TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
+ unsigned long long dir_blkno, unsigned long long extra),
+ TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra),
+ TP_STRUCT__entry(
+ __field(void *, dir)
+ __field(void *, dentry)
+ __field(int, name_len)
+ __string(name, name)
+ __field(unsigned long long, dir_blkno)
+ __field(unsigned long long, extra)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->dentry = dentry;
+ __entry->name_len = name_len;
+ __assign_str(name, name);
+ __entry->dir_blkno = dir_blkno;
+ __entry->extra = extra;
+ ),
+ TP_printk("%p %p %.*s %llu %llu", __entry->dir, __entry->dentry,
+ __entry->name_len, __get_str(name),
+ __entry->dir_blkno, __entry->extra)
+);
+
+#define DEFINE_OCFS2_DENTRY_OPS(name) \
+DEFINE_EVENT(ocfs2__dentry_ops, name, \
+TP_PROTO(void *dir, void *dentry, int name_len, const char *name, \
+ unsigned long long dir_blkno, unsigned long long extra), \
+ TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra))
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_lookup);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_mkdir);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_create);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_unlink);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_symlink_create);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_mv_orphaned_inode_to_new);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_lookup_ret);
+
+TRACE_EVENT(ocfs2_mknod,
+ TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
+ unsigned long long dir_blkno, unsigned long dev, int mode),
+ TP_ARGS(dir, dentry, name_len, name, dir_blkno, dev, mode),
+ TP_STRUCT__entry(
+ __field(void *, dir)
+ __field(void *, dentry)
+ __field(int, name_len)
+ __string(name, name)
+ __field(unsigned long long, dir_blkno)
+ __field(unsigned long, dev)
+ __field(int, mode)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->dentry = dentry;
+ __entry->name_len = name_len;
+ __assign_str(name, name);
+ __entry->dir_blkno = dir_blkno;
+ __entry->dev = dev;
+ __entry->mode = mode;
+ ),
+ TP_printk("%p %p %.*s %llu %lu %d", __entry->dir, __entry->dentry,
+ __entry->name_len, __get_str(name),
+ __entry->dir_blkno, __entry->dev, __entry->mode)
+);
+
+TRACE_EVENT(ocfs2_link,
+ TP_PROTO(unsigned long long ino, int old_len, const char *old_name,
+ int name_len, const char *name),
+ TP_ARGS(ino, old_len, old_name, name_len, name),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(int, old_len)
+ __string(old_name, old_name)
+ __field(int, name_len)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->old_len = old_len;
+ __assign_str(old_name, old_name);
+ __entry->name_len = name_len;
+ __assign_str(name, name);
+ ),
+ TP_printk("%llu %.*s %.*s", __entry->ino,
+ __entry->old_len, __get_str(old_name),
+ __entry->name_len, __get_str(name))
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_unlink_noent);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock_end);
+
+TRACE_EVENT(ocfs2_rename,
+ TP_PROTO(void *old_dir, void *old_dentry,
+ void *new_dir, void *new_dentry,
+ int old_len, const char *old_name,
+ int new_len, const char *new_name),
+ TP_ARGS(old_dir, old_dentry, new_dir, new_dentry,
+ old_len, old_name, new_len, new_name),
+ TP_STRUCT__entry(
+ __field(void *, old_dir)
+ __field(void *, old_dentry)
+ __field(void *, new_dir)
+ __field(void *, new_dentry)
+ __field(int, old_len)
+ __string(old_name, old_name)
+ __field(int, new_len)
+ __string(new_name, new_name)
+ ),
+ TP_fast_assign(
+ __entry->old_dir = old_dir;
+ __entry->old_dentry = old_dentry;
+ __entry->new_dir = new_dir;
+ __entry->new_dentry = new_dentry;
+ __entry->old_len = old_len;
+ __assign_str(old_name, old_name);
+ __entry->new_len = new_len;
+ __assign_str(new_name, new_name);
+ ),
+ TP_printk("%p %p %p %p %.*s %.*s",
+ __entry->old_dir, __entry->old_dentry,
+ __entry->new_dir, __entry->new_dentry,
+ __entry->old_len, __get_str(old_name),
+ __entry->new_len, __get_str(new_name))
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted);
+
+TRACE_EVENT(ocfs2_rename_target_exists,
+ TP_PROTO(int new_len, const char *new_name),
+ TP_ARGS(new_len, new_name),
+ TP_STRUCT__entry(
+ __field(int, new_len)
+ __string(new_name, new_name)
+ ),
+ TP_fast_assign(
+ __entry->new_len = new_len;
+ __assign_str(new_name, new_name);
+ ),
+ TP_printk("%.*s", __entry->new_len, __get_str(new_name))
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_rename_disagree);
+
+TRACE_EVENT(ocfs2_rename_over_existing,
+ TP_PROTO(unsigned long long new_blkno, void *new_bh,
+ unsigned long long newdi_blkno),
+ TP_ARGS(new_blkno, new_bh, newdi_blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, new_blkno)
+ __field(void *, new_bh)
+ __field(unsigned long long, newdi_blkno)
+ ),
+ TP_fast_assign(
+ __entry->new_blkno = new_blkno;
+ __entry->new_bh = new_bh;
+ __entry->newdi_blkno = newdi_blkno;
+ ),
+ TP_printk("%llu %p %llu", __entry->new_blkno, __entry->new_bh,
+ __entry->newdi_blkno)
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_create_symlink_data);
+
+TRACE_EVENT(ocfs2_symlink_begin,
+ TP_PROTO(void *dir, void *dentry, const char *symname,
+ int len, const char *name),
+ TP_ARGS(dir, dentry, symname, len, name),
+ TP_STRUCT__entry(
+ __field(void *, dir)
+ __field(void *, dentry)
+ __field(const char *, symname)
+ __field(int, len)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->dentry = dentry;
+ __entry->symname = symname;
+ __entry->len = len;
+ __assign_str(name, name);
+ ),
+ TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry,
+ __entry->symname, __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_blkno_stringify,
+ TP_PROTO(unsigned long long blkno, const char *name, int namelen),
+ TP_ARGS(blkno, name, namelen),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ __string(name, name)
+ __field(int, namelen)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ __assign_str(name, name);
+ __entry->namelen = namelen;
+ ),
+ TP_printk("%llu %s %d", __entry->blkno, __get_str(name),
+ __entry->namelen)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_add_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_orphan_add_end);
+
+TRACE_EVENT(ocfs2_orphan_del,
+ TP_PROTO(unsigned long long dir, const char *name, int namelen),
+ TP_ARGS(dir, name, namelen),
+ TP_STRUCT__entry(
+ __field(unsigned long long, dir)
+ __string(name, name)
+ __field(int, namelen)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __assign_str(name, name);
+ __entry->namelen = namelen;
+ ),
+ TP_printk("%llu %s %d", __entry->dir, __get_str(name),
+ __entry->namelen)
+);
+
+/* End of trace events for fs/ocfs2/namei.c. */
+
+/* Trace events for fs/ocfs2/dcache.c. */
+
+TRACE_EVENT(ocfs2_dentry_revalidate,
+ TP_PROTO(void *dentry, int len, const char *name),
+ TP_ARGS(dentry, len, name),
+ TP_STRUCT__entry(
+ __field(void *, dentry)
+ __field(int, len)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->dentry = dentry;
+ __entry->len = len;
+ __assign_str(name, name);
+ ),
+ TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_dentry_revalidate_negative,
+ TP_PROTO(int len, const char *name, unsigned long pgen,
+ unsigned long gen),
+ TP_ARGS(len, name, pgen, gen),
+ TP_STRUCT__entry(
+ __field(int, len)
+ __string(name, name)
+ __field(unsigned long, pgen)
+ __field(unsigned long, gen)
+ ),
+ TP_fast_assign(
+ __entry->len = len;
+ __assign_str(name, name);
+ __entry->pgen = pgen;
+ __entry->gen = gen;
+ ),
+ TP_printk("%.*s %lu %lu", __entry->len, __get_str(name),
+ __entry->pgen, __entry->gen)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_delete);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_dentry_revalidate_orphaned);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_nofsdata);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_dentry_revalidate_ret);
+
+TRACE_EVENT(ocfs2_find_local_alias,
+ TP_PROTO(int len, const char *name),
+ TP_ARGS(len, name),
+ TP_STRUCT__entry(
+ __field(int, len)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->len = len;
+ __assign_str(name, name);
+ ),
+ TP_printk("%.*s", __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_dentry_attach_lock,
+ TP_PROTO(int len, const char *name,
+ unsigned long long parent, void *fsdata),
+ TP_ARGS(len, name, parent, fsdata),
+ TP_STRUCT__entry(
+ __field(int, len)
+ __string(name, name)
+ __field(unsigned long long, parent)
+ __field(void *, fsdata)
+ ),
+ TP_fast_assign(
+ __entry->len = len;
+ __assign_str(name, name);
+ __entry->parent = parent;
+ __entry->fsdata = fsdata;
+ ),
+ TP_printk("%.*s %llu %p", __entry->len, __get_str(name),
+ __entry->parent, __entry->fsdata)
+);
+
+TRACE_EVENT(ocfs2_dentry_attach_lock_found,
+ TP_PROTO(const char *name, unsigned long long parent,
+ unsigned long long ino),
+ TP_ARGS(name, parent, ino),
+ TP_STRUCT__entry(
+ __string(name, name)
+ __field(unsigned long long, parent)
+ __field(unsigned long long, ino)
+ ),
+ TP_fast_assign(
+ __assign_str(name, name);
+ __entry->parent = parent;
+ __entry->ino = ino;
+ ),
+ TP_printk("%s %llu %llu", __get_str(name), __entry->parent, __entry->ino)
+);
+/* End of trace events for fs/ocfs2/dcache.c. */
+
+/* Trace events for fs/ocfs2/export.c. */
+
+TRACE_EVENT(ocfs2_get_dentry_begin,
+ TP_PROTO(void *sb, void *handle, unsigned long long blkno),
+ TP_ARGS(sb, handle, blkno),
+ TP_STRUCT__entry(
+ __field(void *, sb)
+ __field(void *, handle)
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->sb = sb;
+ __entry->handle = handle;
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%p %p %llu", __entry->sb, __entry->handle, __entry->blkno)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_get_dentry_test_bit);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_get_dentry_stale);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_get_dentry_generation);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_dentry_end);
+
+TRACE_EVENT(ocfs2_get_parent,
+ TP_PROTO(void *child, int len, const char *name,
+ unsigned long long ino),
+ TP_ARGS(child, len, name, ino),
+ TP_STRUCT__entry(
+ __field(void *, child)
+ __field(int, len)
+ __string(name, name)
+ __field(unsigned long long, ino)
+ ),
+ TP_fast_assign(
+ __entry->child = child;
+ __entry->len = len;
+ __assign_str(name, name);
+ __entry->ino = ino;
+ ),
+ TP_printk("%p %.*s %llu", __entry->child, __entry->len,
+ __get_str(name), __entry->ino)
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_parent_end);
+
+TRACE_EVENT(ocfs2_encode_fh_begin,
+ TP_PROTO(void *dentry, int name_len, const char *name,
+ void *fh, int len, int connectable),
+ TP_ARGS(dentry, name_len, name, fh, len, connectable),
+ TP_STRUCT__entry(
+ __field(void *, dentry)
+ __field(int, name_len)
+ __string(name, name)
+ __field(void *, fh)
+ __field(int, len)
+ __field(int, connectable)
+ ),
+ TP_fast_assign(
+ __entry->dentry = dentry;
+ __entry->name_len = name_len;
+ __assign_str(name, name);
+ __entry->fh = fh;
+ __entry->len = len;
+ __entry->connectable = connectable;
+ ),
+ TP_printk("%p %.*s %p %d %d", __entry->dentry, __entry->name_len,
+ __get_str(name), __entry->fh, __entry->len,
+ __entry->connectable)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_self);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_parent);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_encode_fh_type);
+
+/* End of trace events for fs/ocfs2/export.c. */
+
+/* Trace events for fs/ocfs2/journal.c. */
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_commit_cache_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_journal_init);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_journal_init_maxlen);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_journal_shutdown);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_journal_shutdown_wait);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_complete_recovery);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_complete_recovery_end);
+
+TRACE_EVENT(ocfs2_complete_recovery_slot,
+ TP_PROTO(int slot, unsigned long long la_ino,
+ unsigned long long tl_ino, void *qrec),
+ TP_ARGS(slot, la_ino, tl_ino, qrec),
+ TP_STRUCT__entry(
+ __field(int, slot)
+ __field(unsigned long long, la_ino)
+ __field(unsigned long long, tl_ino)
+ __field(void *, qrec)
+ ),
+ TP_fast_assign(
+ __entry->slot = slot;
+ __entry->la_ino = la_ino;
+ __entry->tl_ino = tl_ino;
+ __entry->qrec = qrec;
+ ),
+ TP_printk("%d %llu %llu %p", __entry->slot, __entry->la_ino,
+ __entry->tl_ino, __entry->qrec)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_recovery_thread_node);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_recovery_thread_end);
+
+TRACE_EVENT(ocfs2_recovery_thread,
+ TP_PROTO(int node_num, int osb_node_num, int disable,
+ void *recovery_thread, int map_set),
+ TP_ARGS(node_num, osb_node_num, disable, recovery_thread, map_set),
+ TP_STRUCT__entry(
+ __field(int, node_num)
+ __field(int, osb_node_num)
+ __field(int,disable)
+ __field(void *, recovery_thread)
+ __field(int,map_set)
+ ),
+ TP_fast_assign(
+ __entry->node_num = node_num;
+ __entry->osb_node_num = osb_node_num;
+ __entry->disable = disable;
+ __entry->recovery_thread = recovery_thread;
+ __entry->map_set = map_set;
+ ),
+ TP_printk("%d %d %d %p %d", __entry->node_num,
+ __entry->osb_node_num, __entry->disable,
+ __entry->recovery_thread, __entry->map_set)
+);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_replay_journal_recovered);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_lock_err);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_skip);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_recover_node);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_recover_node_skip);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_mark_dead_nodes);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_end);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_filldir);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_recover_orphans);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_recover_orphans_iput);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_wait_on_mount);
+
+/* End of trace events for fs/ocfs2/journal.c. */
+
+/* Trace events for fs/ocfs2/buffer_head_io.c. */
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_read_blocks_sync);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_read_blocks_sync_jbd);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_read_blocks_from_disk);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_bh);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_end);
+
+TRACE_EVENT(ocfs2_write_block,
+ TP_PROTO(unsigned long long block, void *ci),
+ TP_ARGS(block, ci),
+ TP_STRUCT__entry(
+ __field(unsigned long long, block)
+ __field(void *, ci)
+ ),
+ TP_fast_assign(
+ __entry->block = block;
+ __entry->ci = ci;
+ ),
+ TP_printk("%llu %p", __entry->block, __entry->ci)
+);
+
+TRACE_EVENT(ocfs2_read_blocks_begin,
+ TP_PROTO(void *ci, unsigned long long block,
+ unsigned int nr, int flags),
+ TP_ARGS(ci, block, nr, flags),
+ TP_STRUCT__entry(
+ __field(void *, ci)
+ __field(unsigned long long, block)
+ __field(unsigned int, nr)
+ __field(int, flags)
+ ),
+ TP_fast_assign(
+ __entry->ci = ci;
+ __entry->block = block;
+ __entry->nr = nr;
+ __entry->flags = flags;
+ ),
+ TP_printk("%p %llu %u %d", __entry->ci, __entry->block,
+ __entry->nr, __entry->flags)
+);
+
+/* End of trace events for fs/ocfs2/buffer_head_io.c. */
+
+/* Trace events for fs/ocfs2/uptodate.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_copied_metadata_tree);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_metadata_cache_purge);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_buffer_cached_begin);
+
+TRACE_EVENT(ocfs2_buffer_cached_end,
+ TP_PROTO(int index, void *item),
+ TP_ARGS(index, item),
+ TP_STRUCT__entry(
+ __field(int, index)
+ __field(void *, item)
+ ),
+ TP_fast_assign(
+ __entry->index = index;
+ __entry->item = item;
+ ),
+ TP_printk("%d %p", __entry->index, __entry->item)
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_append_cache_array);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_insert_cache_tree);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_expand_cache);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_set_buffer_uptodate);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_set_buffer_uptodate_begin);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_remove_metadata_array);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_remove_metadata_tree);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_remove_block_from_cache);
+
+/* End of trace events for fs/ocfs2/uptodate.c. */
+#endif /* _TRACE_OCFS2_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE ocfs2_trace
+#include <trace/define_trace.h>
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
new file mode 100644
index 00000000000..f266d67df3c
--- /dev/null
+++ b/fs/ocfs2/quota.h
@@ -0,0 +1,119 @@
+/*
+ * quota.h for OCFS2
+ *
+ * On disk quota structures for local and global quota file, in-memory
+ * structures.
+ *
+ */
+
+#ifndef _OCFS2_QUOTA_H
+#define _OCFS2_QUOTA_H
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/quota.h>
+#include <linux/list.h>
+#include <linux/dqblk_qtree.h>
+
+#include "ocfs2.h"
+
+/*
+ * In-memory structures
+ */
+struct ocfs2_dquot {
+ struct dquot dq_dquot; /* Generic VFS dquot */
+ loff_t dq_local_off; /* Offset in the local quota file */
+ u64 dq_local_phys_blk; /* Physical block carrying quota structure */
+ struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
+ unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
+ s64 dq_origspace; /* Last globally synced space usage */
+ s64 dq_originodes; /* Last globally synced inode usage */
+ struct llist_node list; /* Member of list of dquots to drop */
+};
+
+/* Description of one chunk to recover in memory */
+struct ocfs2_recovery_chunk {
+ struct list_head rc_list; /* List of chunks */
+ int rc_chunk; /* Chunk number */
+ unsigned long *rc_bitmap; /* Bitmap of entries to recover */
+};
+
+struct ocfs2_quota_recovery {
+ struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */
+};
+
+/* In-memory structure with quota header information */
+struct ocfs2_mem_dqinfo {
+ unsigned int dqi_type; /* Quota type this structure describes */
+ unsigned int dqi_chunks; /* Number of chunks in local quota file */
+ unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
+ unsigned int dqi_syncms; /* How often should we sync with other nodes */
+ struct list_head dqi_chunk; /* List of chunks */
+ struct inode *dqi_gqinode; /* Global quota file inode */
+ struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
+ struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
+ int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
+ u64 dqi_giblk; /* Number of block with global information header */
+ struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
+ struct buffer_head *dqi_libh; /* Buffer with local information header */
+ struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
+ struct delayed_work dqi_sync_work; /* Work for syncing dquots */
+ struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
+ * information, in case we
+ * enable quotas on file
+ * needing it */
+};
+
+static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
+{
+ return container_of(dquot, struct ocfs2_dquot, dq_dquot);
+}
+
+struct ocfs2_quota_chunk {
+ struct list_head qc_chunk; /* List of quotafile chunks */
+ int qc_num; /* Number of quota chunk */
+ struct buffer_head *qc_headerbh; /* Buffer head with chunk header */
+};
+
+extern struct kmem_cache *ocfs2_dquot_cachep;
+extern struct kmem_cache *ocfs2_qf_chunk_cachep;
+
+extern struct qtree_fmt_operations ocfs2_global_ops;
+
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+ struct ocfs2_super *osb, int slot_num);
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+ struct ocfs2_quota_recovery *rec,
+ int slot_num);
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+ size_t len, loff_t off);
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+ const char *data, size_t len, loff_t off);
+int ocfs2_global_read_info(struct super_block *sb, int type);
+int ocfs2_global_write_info(struct super_block *sb, int type);
+int ocfs2_global_read_dquot(struct dquot *dquot);
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
+static inline int ocfs2_sync_dquot(struct dquot *dquot)
+{
+ return __ocfs2_sync_dquot(dquot, 0);
+}
+static inline int ocfs2_global_release_dquot(struct dquot *dquot)
+{
+ return __ocfs2_sync_dquot(dquot, 1);
+}
+
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh);
+int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
+ struct buffer_head **bh);
+int ocfs2_create_local_dquot(struct dquot *dquot);
+int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
+int ocfs2_local_write_dquot(struct dquot *dquot);
+void ocfs2_drop_dquot_refs(struct work_struct *work);
+
+extern const struct dquot_operations ocfs2_quota_operations;
+extern struct quota_format_type ocfs2_quota_format;
+
+#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
new file mode 100644
index 00000000000..b990a62cff5
--- /dev/null
+++ b/fs/ocfs2/quota_global.c
@@ -0,0 +1,971 @@
+/*
+ * Implementation of operations over global quota file
+ */
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/dqblk_qtree.h>
+#include <linux/jiffies.h>
+#include <linux/writeback.h>
+#include <linux/workqueue.h>
+#include <linux/llist.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "alloc.h"
+#include "blockcheck.h"
+#include "inode.h"
+#include "journal.h"
+#include "file.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "uptodate.h"
+#include "super.h"
+#include "buffer_head_io.h"
+#include "quota.h"
+#include "ocfs2_trace.h"
+
+/*
+ * Locking of quotas with OCFS2 is rather complex. Here are rules that
+ * should be obeyed by all the functions:
+ * - any write of quota structure (either to local or global file) is protected
+ * by dqio_mutex or dquot->dq_lock.
+ * - any modification of global quota file holds inode cluster lock, i_mutex,
+ * and ip_alloc_sem of the global quota file (achieved by
+ * ocfs2_lock_global_qf). It also has to hold qinfo_lock.
+ * - an allocation of new blocks for local quota file is protected by
+ * its ip_alloc_sem
+ *
+ * A rough sketch of locking dependencies (lf = local file, gf = global file):
+ * Normal filesystem operation:
+ * start_trans -> dqio_mutex -> write to lf
+ * Syncing of local and global file:
+ * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ * write to gf
+ * -> write to lf
+ * Acquire dquot for the first time:
+ * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf
+ * -> alloc space for gf
+ * -> start_trans -> qinfo_lock -> write to gf
+ * -> ip_alloc_sem of lf -> alloc space for lf
+ * -> write to lf
+ * Release last reference to dquot:
+ * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf
+ * -> write to lf
+ * Note that all the above operations also hold the inode cluster lock of lf.
+ * Recovery:
+ * inode cluster lock of recovered lf
+ * -> read bitmaps -> ip_alloc_sem of lf
+ * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ * write to gf
+ */
+
+static void qsync_work_fn(struct work_struct *work);
+
+static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
+{
+ struct ocfs2_global_disk_dqblk *d = dp;
+ struct mem_dqblk *m = &dquot->dq_dqb;
+
+ /* Update from disk only entries not set by the admin */
+ if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
+ m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
+ m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
+ }
+ if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+ m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
+ if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
+ m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit);
+ m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit);
+ }
+ if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+ m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
+ if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags))
+ m->dqb_btime = le64_to_cpu(d->dqb_btime);
+ if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags))
+ m->dqb_itime = le64_to_cpu(d->dqb_itime);
+ OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count);
+}
+
+static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
+{
+ struct ocfs2_global_disk_dqblk *d = dp;
+ struct mem_dqblk *m = &dquot->dq_dqb;
+
+ d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id));
+ d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
+ d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
+ d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
+ d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
+ d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
+ d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
+ d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
+ d->dqb_btime = cpu_to_le64(m->dqb_btime);
+ d->dqb_itime = cpu_to_le64(m->dqb_itime);
+ d->dqb_pad1 = d->dqb_pad2 = 0;
+}
+
+static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
+{
+ struct ocfs2_global_disk_dqblk *d = dp;
+ struct ocfs2_mem_dqinfo *oinfo =
+ sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
+
+ if (qtree_entry_unused(&oinfo->dqi_gi, dp))
+ return 0;
+
+ return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type,
+ le32_to_cpu(d->dqb_id)),
+ dquot->dq_id);
+}
+
+struct qtree_fmt_operations ocfs2_global_ops = {
+ .mem2disk_dqblk = ocfs2_global_mem2diskdqb,
+ .disk2mem_dqblk = ocfs2_global_disk2memdqb,
+ .is_id = ocfs2_global_is_id,
+};
+
+int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
+{
+ struct ocfs2_disk_dqtrailer *dqt =
+ ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
+
+ trace_ocfs2_validate_quota_block((unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
+}
+
+int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
+ struct buffer_head **bhp)
+{
+ int rc;
+
+ *bhp = NULL;
+ rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0,
+ ocfs2_validate_quota_block);
+ if (rc)
+ mlog_errno(rc);
+ return rc;
+}
+
+/* Read data from global quotafile - avoid pagecache and such because we cannot
+ * afford acquiring the locks... We use quota cluster lock to serialize
+ * operations. Caller is responsible for acquiring it. */
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+ size_t len, loff_t off)
+{
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ struct inode *gqinode = oinfo->dqi_gqinode;
+ loff_t i_size = i_size_read(gqinode);
+ int offset = off & (sb->s_blocksize - 1);
+ sector_t blk = off >> sb->s_blocksize_bits;
+ int err = 0;
+ struct buffer_head *bh;
+ size_t toread, tocopy;
+ u64 pblock = 0, pcount = 0;
+
+ if (off > i_size)
+ return 0;
+ if (off + len > i_size)
+ len = i_size - off;
+ toread = len;
+ while (toread > 0) {
+ tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
+ if (!pcount) {
+ err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock,
+ &pcount, NULL);
+ if (err) {
+ mlog_errno(err);
+ return err;
+ }
+ } else {
+ pcount--;
+ pblock++;
+ }
+ bh = NULL;
+ err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
+ if (err) {
+ mlog_errno(err);
+ return err;
+ }
+ memcpy(data, bh->b_data + offset, tocopy);
+ brelse(bh);
+ offset = 0;
+ toread -= tocopy;
+ data += tocopy;
+ blk++;
+ }
+ return len;
+}
+
+/* Write to quotafile (we know the transaction is already started and has
+ * enough credits) */
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+ const char *data, size_t len, loff_t off)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct inode *gqinode = oinfo->dqi_gqinode;
+ int offset = off & (sb->s_blocksize - 1);
+ sector_t blk = off >> sb->s_blocksize_bits;
+ int err = 0, new = 0, ja_type;
+ struct buffer_head *bh = NULL;
+ handle_t *handle = journal_current_handle();
+ u64 pblock, pcount;
+
+ if (!handle) {
+ mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
+ "because transaction was not started.\n",
+ (unsigned long long)off, (unsigned long long)len);
+ return -EIO;
+ }
+ if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
+ WARN_ON(1);
+ len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
+ }
+
+ if (i_size_read(gqinode) < off + len) {
+ loff_t rounded_end =
+ ocfs2_align_bytes_to_blocks(sb, off + len);
+
+ /* Space is already allocated in ocfs2_acquire_dquot() */
+ err = ocfs2_simple_size_update(gqinode,
+ oinfo->dqi_gqi_bh,
+ rounded_end);
+ if (err < 0)
+ goto out;
+ new = 1;
+ }
+ err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL);
+ if (err) {
+ mlog_errno(err);
+ goto out;
+ }
+ /* Not rewriting whole block? */
+ if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
+ !new) {
+ err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
+ ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
+ } else {
+ bh = sb_getblk(sb, pblock);
+ if (!bh)
+ err = -ENOMEM;
+ ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
+ }
+ if (err) {
+ mlog_errno(err);
+ goto out;
+ }
+ lock_buffer(bh);
+ if (new)
+ memset(bh->b_data, 0, sb->s_blocksize);
+ memcpy(bh->b_data + offset, data, len);
+ flush_dcache_page(bh->b_page);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh);
+ err = ocfs2_journal_access_dq(handle, INODE_CACHE(gqinode), bh,
+ ja_type);
+ if (err < 0) {
+ brelse(bh);
+ goto out;
+ }
+ ocfs2_journal_dirty(handle, bh);
+ brelse(bh);
+out:
+ if (err) {
+ mlog_errno(err);
+ return err;
+ }
+ gqinode->i_version++;
+ ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
+ return len;
+}
+
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+ int status;
+ struct buffer_head *bh = NULL;
+
+ status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex);
+ if (status < 0)
+ return status;
+ spin_lock(&dq_data_lock);
+ if (!oinfo->dqi_gqi_count++)
+ oinfo->dqi_gqi_bh = bh;
+ else
+ WARN_ON(bh != oinfo->dqi_gqi_bh);
+ spin_unlock(&dq_data_lock);
+ if (ex) {
+ mutex_lock(&oinfo->dqi_gqinode->i_mutex);
+ down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+ } else {
+ down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+ }
+ return 0;
+}
+
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+ if (ex) {
+ up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+ mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
+ } else {
+ up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+ }
+ ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
+ brelse(oinfo->dqi_gqi_bh);
+ spin_lock(&dq_data_lock);
+ if (!--oinfo->dqi_gqi_count)
+ oinfo->dqi_gqi_bh = NULL;
+ spin_unlock(&dq_data_lock);
+}
+
+/* Read information header from global quota file */
+int ocfs2_global_read_info(struct super_block *sb, int type)
+{
+ struct inode *gqinode = NULL;
+ unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+ GROUP_QUOTA_SYSTEM_INODE };
+ struct ocfs2_global_disk_dqinfo dinfo;
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ u64 pcount;
+ int status;
+
+ /* Read global header */
+ gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+ OCFS2_INVALID_SLOT);
+ if (!gqinode) {
+ mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
+ type);
+ status = -EINVAL;
+ goto out_err;
+ }
+ oinfo->dqi_gi.dqi_sb = sb;
+ oinfo->dqi_gi.dqi_type = type;
+ ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
+ oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
+ oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
+ oinfo->dqi_gqi_bh = NULL;
+ oinfo->dqi_gqi_count = 0;
+ oinfo->dqi_gqinode = gqinode;
+ status = ocfs2_lock_global_qf(oinfo, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+
+ status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
+ &pcount, NULL);
+ if (status < 0)
+ goto out_unlock;
+
+ status = ocfs2_qinfo_lock(oinfo, 0);
+ if (status < 0)
+ goto out_unlock;
+ status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
+ sizeof(struct ocfs2_global_disk_dqinfo),
+ OCFS2_GLOBAL_INFO_OFF);
+ ocfs2_qinfo_unlock(oinfo, 0);
+ ocfs2_unlock_global_qf(oinfo, 0);
+ if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
+ mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
+ status);
+ if (status >= 0)
+ status = -EIO;
+ mlog_errno(status);
+ goto out_err;
+ }
+ info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
+ info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
+ oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
+ oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+ oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+ oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+ oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
+ oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
+ OCFS2_QBLK_RESERVED_SPACE;
+ oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
+ INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
+ schedule_delayed_work(&oinfo->dqi_sync_work,
+ msecs_to_jiffies(oinfo->dqi_syncms));
+
+out_err:
+ return status;
+out_unlock:
+ ocfs2_unlock_global_qf(oinfo, 0);
+ mlog_errno(status);
+ goto out_err;
+}
+
+/* Write information to global quota file. Expects exlusive lock on quota
+ * file inode and quota info */
+static int __ocfs2_global_write_info(struct super_block *sb, int type)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct ocfs2_global_disk_dqinfo dinfo;
+ ssize_t size;
+
+ spin_lock(&dq_data_lock);
+ info->dqi_flags &= ~DQF_INFO_DIRTY;
+ dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
+ dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
+ spin_unlock(&dq_data_lock);
+ dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
+ dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
+ dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
+ dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
+ size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
+ sizeof(struct ocfs2_global_disk_dqinfo),
+ OCFS2_GLOBAL_INFO_OFF);
+ if (size != sizeof(struct ocfs2_global_disk_dqinfo)) {
+ mlog(ML_ERROR, "Cannot write global quota info structure\n");
+ if (size >= 0)
+ size = -EIO;
+ return size;
+ }
+ return 0;
+}
+
+int ocfs2_global_write_info(struct super_block *sb, int type)
+{
+ int err;
+ struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+
+ err = ocfs2_qinfo_lock(info, 1);
+ if (err < 0)
+ return err;
+ err = __ocfs2_global_write_info(sb, type);
+ ocfs2_qinfo_unlock(info, 1);
+ return err;
+}
+
+static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
+{
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+ /*
+ * We may need to allocate tree blocks and a leaf block but not the
+ * root block
+ */
+ return oinfo->dqi_gi.dqi_qtree_depth;
+}
+
+static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
+{
+ /* We modify all the allocated blocks, tree root, info block and
+ * the inode */
+ return (ocfs2_global_qinit_alloc(sb, type) + 2) *
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1;
+}
+
+/* Sync local information about quota modifications with global quota file.
+ * Caller must have started the transaction and obtained exclusive lock for
+ * global quota file inode */
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
+{
+ int err, err2;
+ struct super_block *sb = dquot->dq_sb;
+ int type = dquot->dq_id.type;
+ struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+ struct ocfs2_global_disk_dqblk dqblk;
+ s64 spacechange, inodechange;
+ time_t olditime, oldbtime;
+
+ err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+ sizeof(struct ocfs2_global_disk_dqblk),
+ dquot->dq_off);
+ if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
+ if (err >= 0) {
+ mlog(ML_ERROR, "Short read from global quota file "
+ "(%u read)\n", err);
+ err = -EIO;
+ }
+ goto out;
+ }
+
+ /* Update space and inode usage. Get also other information from
+ * global quota file so that we don't overwrite any changes there.
+ * We are */
+ spin_lock(&dq_data_lock);
+ spacechange = dquot->dq_dqb.dqb_curspace -
+ OCFS2_DQUOT(dquot)->dq_origspace;
+ inodechange = dquot->dq_dqb.dqb_curinodes -
+ OCFS2_DQUOT(dquot)->dq_originodes;
+ olditime = dquot->dq_dqb.dqb_itime;
+ oldbtime = dquot->dq_dqb.dqb_btime;
+ ocfs2_global_disk2memdqb(dquot, &dqblk);
+ trace_ocfs2_sync_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+ dquot->dq_dqb.dqb_curspace,
+ (long long)spacechange,
+ dquot->dq_dqb.dqb_curinodes,
+ (long long)inodechange);
+ if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+ dquot->dq_dqb.dqb_curspace += spacechange;
+ if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+ dquot->dq_dqb.dqb_curinodes += inodechange;
+ /* Set properly space grace time... */
+ if (dquot->dq_dqb.dqb_bsoftlimit &&
+ dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
+ if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
+ oldbtime > 0) {
+ if (dquot->dq_dqb.dqb_btime > 0)
+ dquot->dq_dqb.dqb_btime =
+ min(dquot->dq_dqb.dqb_btime, oldbtime);
+ else
+ dquot->dq_dqb.dqb_btime = oldbtime;
+ }
+ } else {
+ dquot->dq_dqb.dqb_btime = 0;
+ clear_bit(DQ_BLKS_B, &dquot->dq_flags);
+ }
+ /* Set properly inode grace time... */
+ if (dquot->dq_dqb.dqb_isoftlimit &&
+ dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
+ if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
+ olditime > 0) {
+ if (dquot->dq_dqb.dqb_itime > 0)
+ dquot->dq_dqb.dqb_itime =
+ min(dquot->dq_dqb.dqb_itime, olditime);
+ else
+ dquot->dq_dqb.dqb_itime = olditime;
+ }
+ } else {
+ dquot->dq_dqb.dqb_itime = 0;
+ clear_bit(DQ_INODES_B, &dquot->dq_flags);
+ }
+ /* All information is properly updated, clear the flags */
+ __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+ OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+ OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+ spin_unlock(&dq_data_lock);
+ err = ocfs2_qinfo_lock(info, freeing);
+ if (err < 0) {
+ mlog(ML_ERROR, "Failed to lock quota info, losing quota write"
+ " (type=%d, id=%u)\n", dquot->dq_id.type,
+ (unsigned)from_kqid(&init_user_ns, dquot->dq_id));
+ goto out;
+ }
+ if (freeing)
+ OCFS2_DQUOT(dquot)->dq_use_count--;
+ err = qtree_write_dquot(&info->dqi_gi, dquot);
+ if (err < 0)
+ goto out_qlock;
+ if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) {
+ err = qtree_release_dquot(&info->dqi_gi, dquot);
+ if (info_dirty(sb_dqinfo(sb, type))) {
+ err2 = __ocfs2_global_write_info(sb, type);
+ if (!err)
+ err = err2;
+ }
+ }
+out_qlock:
+ ocfs2_qinfo_unlock(info, freeing);
+out:
+ if (err < 0)
+ mlog_errno(err);
+ return err;
+}
+
+/*
+ * Functions for periodic syncing of dquots with global file
+ */
+static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
+{
+ handle_t *handle;
+ struct super_block *sb = dquot->dq_sb;
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ int status = 0;
+
+ trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id),
+ dquot->dq_id.type,
+ type, sb->s_id);
+ if (type != dquot->dq_id.type)
+ goto out;
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+
+ handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+ status = ocfs2_sync_dquot(dquot);
+ if (status < 0)
+ mlog_errno(status);
+ /* We have to write local structure as well... */
+ status = ocfs2_local_write_dquot(dquot);
+ if (status < 0)
+ mlog_errno(status);
+ mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+ ocfs2_commit_trans(osb, handle);
+out_ilock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ return status;
+}
+
+static void qsync_work_fn(struct work_struct *work)
+{
+ struct ocfs2_mem_dqinfo *oinfo = container_of(work,
+ struct ocfs2_mem_dqinfo,
+ dqi_sync_work.work);
+ struct super_block *sb = oinfo->dqi_gqinode->i_sb;
+
+ dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
+ schedule_delayed_work(&oinfo->dqi_sync_work,
+ msecs_to_jiffies(oinfo->dqi_syncms));
+}
+
+/*
+ * Wrappers for generic quota functions
+ */
+
+static int ocfs2_write_dquot(struct dquot *dquot)
+{
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+ int status = 0;
+
+ trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+ dquot->dq_id.type);
+
+ handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out;
+ }
+ mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
+ status = ocfs2_local_write_dquot(dquot);
+ mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
+ ocfs2_commit_trans(osb, handle);
+out:
+ return status;
+}
+
+static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+{
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ /*
+ * We modify tree, leaf block, global info, local chunk header,
+ * global and local inode; OCFS2_QINFO_WRITE_CREDITS already
+ * accounts for inode update
+ */
+ return (oinfo->dqi_gi.dqi_qtree_depth + 2) *
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
+ OCFS2_QINFO_WRITE_CREDITS +
+ OCFS2_INODE_UPDATE_CREDITS;
+}
+
+void ocfs2_drop_dquot_refs(struct work_struct *work)
+{
+ struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+ dquot_drop_work);
+ struct llist_node *list;
+ struct ocfs2_dquot *odquot, *next_odquot;
+
+ list = llist_del_all(&osb->dquot_drop_list);
+ llist_for_each_entry_safe(odquot, next_odquot, list, list) {
+ /* Drop the reference we acquired in ocfs2_dquot_release() */
+ dqput(&odquot->dq_dquot);
+ }
+}
+
+/*
+ * Called when the last reference to dquot is dropped. If we are called from
+ * downconvert thread, we cannot do all the handling here because grabbing
+ * quota lock could deadlock (the node holding the quota lock could need some
+ * other cluster lock to proceed but with blocked downconvert thread we cannot
+ * release any lock).
+ */
+static int ocfs2_release_dquot(struct dquot *dquot)
+{
+ handle_t *handle;
+ struct ocfs2_mem_dqinfo *oinfo =
+ sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
+ struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+ int status = 0;
+
+ trace_ocfs2_release_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+ dquot->dq_id.type);
+
+ mutex_lock(&dquot->dq_lock);
+ /* Check whether we are not racing with some other dqget() */
+ if (atomic_read(&dquot->dq_count) > 1)
+ goto out;
+ /* Running from downconvert thread? Postpone quota processing to wq */
+ if (current == osb->dc_task) {
+ /*
+ * Grab our own reference to dquot and queue it for delayed
+ * dropping. Quota code rechecks after calling
+ * ->release_dquot() and won't free dquot structure.
+ */
+ dqgrab(dquot);
+ /* First entry on list -> queue work */
+ if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
+ queue_work(ocfs2_wq, &osb->dquot_drop_work);
+ goto out;
+ }
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+ handle = ocfs2_start_trans(osb,
+ ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_id.type));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+
+ status = ocfs2_global_release_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ status = ocfs2_local_release_dquot(handle, dquot);
+ /*
+ * If we fail here, we cannot do much as global structure is
+ * already released. So just complain...
+ */
+ if (status < 0)
+ mlog_errno(status);
+ /*
+ * Clear dq_off so that we search for the structure in quota file next
+ * time we acquire it. The structure might be deleted and reallocated
+ * elsewhere by another node while our dquot structure is on freelist.
+ */
+ dquot->dq_off = 0;
+ clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+out_trans:
+ ocfs2_commit_trans(osb, handle);
+out_ilock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ mutex_unlock(&dquot->dq_lock);
+ if (status)
+ mlog_errno(status);
+ return status;
+}
+
+/*
+ * Read global dquot structure from disk or create it if it does
+ * not exist. Also update use count of the global structure and
+ * create structure in node-local quota file.
+ */
+static int ocfs2_acquire_dquot(struct dquot *dquot)
+{
+ int status = 0, err;
+ int ex = 0;
+ struct super_block *sb = dquot->dq_sb;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ int type = dquot->dq_id.type;
+ struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+ struct inode *gqinode = info->dqi_gqinode;
+ int need_alloc = ocfs2_global_qinit_alloc(sb, type);
+ handle_t *handle;
+
+ trace_ocfs2_acquire_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+ type);
+ mutex_lock(&dquot->dq_lock);
+ /*
+ * We need an exclusive lock, because we're going to update use count
+ * and instantiate possibly new dquot structure
+ */
+ status = ocfs2_lock_global_qf(info, 1);
+ if (status < 0)
+ goto out;
+ status = ocfs2_qinfo_lock(info, 0);
+ if (status < 0)
+ goto out_dq;
+ /*
+ * We always want to read dquot structure from disk because we don't
+ * know what happened with it while it was on freelist.
+ */
+ status = qtree_read_dquot(&info->dqi_gi, dquot);
+ ocfs2_qinfo_unlock(info, 0);
+ if (status < 0)
+ goto out_dq;
+
+ OCFS2_DQUOT(dquot)->dq_use_count++;
+ OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+ OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+ if (!dquot->dq_off) { /* No real quota entry? */
+ ex = 1;
+ /*
+ * Add blocks to quota file before we start a transaction since
+ * locking allocators ranks above a transaction start
+ */
+ WARN_ON(journal_current_handle());
+ status = ocfs2_extend_no_holes(gqinode, NULL,
+ i_size_read(gqinode) + (need_alloc << sb->s_blocksize_bits),
+ i_size_read(gqinode));
+ if (status < 0)
+ goto out_dq;
+ }
+
+ handle = ocfs2_start_trans(osb,
+ ocfs2_calc_global_qinit_credits(sb, type));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ goto out_dq;
+ }
+ status = ocfs2_qinfo_lock(info, ex);
+ if (status < 0)
+ goto out_trans;
+ status = qtree_write_dquot(&info->dqi_gi, dquot);
+ if (ex && info_dirty(sb_dqinfo(sb, type))) {
+ err = __ocfs2_global_write_info(sb, type);
+ if (!status)
+ status = err;
+ }
+ ocfs2_qinfo_unlock(info, ex);
+out_trans:
+ ocfs2_commit_trans(osb, handle);
+out_dq:
+ ocfs2_unlock_global_qf(info, 1);
+ if (status < 0)
+ goto out;
+
+ status = ocfs2_create_local_dquot(dquot);
+ if (status < 0)
+ goto out;
+ set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+out:
+ mutex_unlock(&dquot->dq_lock);
+ if (status)
+ mlog_errno(status);
+ return status;
+}
+
+static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
+{
+ unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
+ (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
+ (1 << (DQ_LASTSET_B + QIF_INODES_B)) |
+ (1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
+ (1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
+ (1 << (DQ_LASTSET_B + QIF_ITIME_B));
+ int sync = 0;
+ int status;
+ struct super_block *sb = dquot->dq_sb;
+ int type = dquot->dq_id.type;
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+
+ trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id),
+ type);
+
+ /* In case user set some limits, sync dquot immediately to global
+ * quota file so that information propagates quicker */
+ spin_lock(&dq_data_lock);
+ if (dquot->dq_flags & mask)
+ sync = 1;
+ spin_unlock(&dq_data_lock);
+ /* This is a slight hack but we can't afford getting global quota
+ * lock if we already have a transaction started. */
+ if (!sync || journal_current_handle()) {
+ status = ocfs2_write_dquot(dquot);
+ goto out;
+ }
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+ handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+ status = ocfs2_sync_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_dlock;
+ }
+ /* Now write updated local dquot structure */
+ status = ocfs2_local_write_dquot(dquot);
+out_dlock:
+ mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+ ocfs2_commit_trans(osb, handle);
+out_ilock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ if (status)
+ mlog_errno(status);
+ return status;
+}
+
+/* This should happen only after set_dqinfo(). */
+static int ocfs2_write_info(struct super_block *sb, int type)
+{
+ handle_t *handle;
+ int status = 0;
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+ handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ status = dquot_commit_info(sb, type);
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_ilock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ if (status)
+ mlog_errno(status);
+ return status;
+}
+
+static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
+{
+ struct ocfs2_dquot *dquot =
+ kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
+
+ if (!dquot)
+ return NULL;
+ return &dquot->dq_dquot;
+}
+
+static void ocfs2_destroy_dquot(struct dquot *dquot)
+{
+ kmem_cache_free(ocfs2_dquot_cachep, dquot);
+}
+
+const struct dquot_operations ocfs2_quota_operations = {
+ /* We never make dquot dirty so .write_dquot is never called */
+ .acquire_dquot = ocfs2_acquire_dquot,
+ .release_dquot = ocfs2_release_dquot,
+ .mark_dirty = ocfs2_mark_dquot_dirty,
+ .write_info = ocfs2_write_info,
+ .alloc_dquot = ocfs2_alloc_dquot,
+ .destroy_dquot = ocfs2_destroy_dquot,
+};
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
new file mode 100644
index 00000000000..2001862bf2b
--- /dev/null
+++ b/fs/ocfs2/quota_local.c
@@ -0,0 +1,1320 @@
+/*
+ * Implementation of operations over local quota file
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/module.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "file.h"
+#include "buffer_head_io.h"
+#include "journal.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "quota.h"
+#include "uptodate.h"
+#include "super.h"
+#include "ocfs2_trace.h"
+
+/* Number of local quota structures per block */
+static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
+{
+ return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) /
+ sizeof(struct ocfs2_local_disk_dqblk));
+}
+
+/* Number of blocks with entries in one chunk */
+static inline unsigned int ol_chunk_blocks(struct super_block *sb)
+{
+ return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+ OCFS2_QBLK_RESERVED_SPACE) << 3) /
+ ol_quota_entries_per_block(sb);
+}
+
+/* Number of entries in a chunk bitmap */
+static unsigned int ol_chunk_entries(struct super_block *sb)
+{
+ return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb);
+}
+
+/* Offset of the chunk in quota file */
+static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
+{
+ /* 1 block for local quota file info, 1 block per chunk for chunk info */
+ return 1 + (ol_chunk_blocks(sb) + 1) * c;
+}
+
+static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off)
+{
+ int epb = ol_quota_entries_per_block(sb);
+
+ return ol_quota_chunk_block(sb, c) + 1 + off / epb;
+}
+
+static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off)
+{
+ int epb = ol_quota_entries_per_block(sb);
+
+ return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Offset of the dquot structure in the quota file */
+static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
+{
+ return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) +
+ ol_dqblk_block_off(sb, c, off);
+}
+
+/* Compute block number from given offset */
+static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
+{
+ return off >> sb->s_blocksize_bits;
+}
+
+static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
+{
+ return off & ((1 << sb->s_blocksize_bits) - 1);
+}
+
+/* Compute offset in the chunk of a structure with the given offset */
+static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off)
+{
+ int epb = ol_quota_entries_per_block(sb);
+
+ return ((off >> sb->s_blocksize_bits) -
+ ol_quota_chunk_block(sb, c) - 1) * epb
+ + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) /
+ sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Write bufferhead into the fs */
+static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
+ void (*modify)(struct buffer_head *, void *), void *private)
+{
+ struct super_block *sb = inode->i_sb;
+ handle_t *handle;
+ int status;
+
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ return status;
+ }
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(inode), bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ return status;
+ }
+ lock_buffer(bh);
+ modify(bh, private);
+ unlock_buffer(bh);
+ ocfs2_journal_dirty(handle, bh);
+
+ status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ if (status < 0) {
+ mlog_errno(status);
+ return status;
+ }
+ return 0;
+}
+
+/*
+ * Read quota block from a given logical offset.
+ *
+ * This function acquires ip_alloc_sem and thus it must not be called with a
+ * transaction started.
+ */
+static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+ struct buffer_head **bh)
+{
+ int rc = 0;
+ struct buffer_head *tmp = *bh;
+
+ if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
+ ocfs2_error(inode->i_sb,
+ "Quota file %llu is probably corrupted! Requested "
+ "to read block %Lu but file has size only %Lu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)v_block,
+ (unsigned long long)i_size_read(inode));
+ return -EIO;
+ }
+ rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
+ ocfs2_validate_quota_block);
+ if (rc)
+ mlog_errno(rc);
+
+ /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
+/* Check whether we understand format of quota files */
+static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
+{
+ unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
+ unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
+ unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
+ unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
+ unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+ GROUP_QUOTA_SYSTEM_INODE };
+ struct buffer_head *bh = NULL;
+ struct inode *linode = sb_dqopt(sb)->files[type];
+ struct inode *ginode = NULL;
+ struct ocfs2_disk_dqheader *dqhead;
+ int status, ret = 0;
+
+ /* First check whether we understand local quota file */
+ status = ocfs2_read_quota_block(linode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
+ type);
+ goto out_err;
+ }
+ dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+ if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) {
+ mlog(ML_ERROR, "quota file magic does not match (%u != %u),"
+ " type=%d\n", le32_to_cpu(dqhead->dqh_magic),
+ lmagics[type], type);
+ goto out_err;
+ }
+ if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) {
+ mlog(ML_ERROR, "quota file version does not match (%u != %u),"
+ " type=%d\n", le32_to_cpu(dqhead->dqh_version),
+ lversions[type], type);
+ goto out_err;
+ }
+ brelse(bh);
+ bh = NULL;
+
+ /* Next check whether we understand global quota file */
+ ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+ OCFS2_INVALID_SLOT);
+ if (!ginode) {
+ mlog(ML_ERROR, "cannot get global quota file inode "
+ "(type=%d)\n", type);
+ goto out_err;
+ }
+ /* Since the header is read only, we don't care about locking */
+ status = ocfs2_read_quota_block(ginode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read global quota file header "
+ "(type=%d)\n", type);
+ goto out_err;
+ }
+ dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+ if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) {
+ mlog(ML_ERROR, "global quota file magic does not match "
+ "(%u != %u), type=%d\n",
+ le32_to_cpu(dqhead->dqh_magic), gmagics[type], type);
+ goto out_err;
+ }
+ if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) {
+ mlog(ML_ERROR, "global quota file version does not match "
+ "(%u != %u), type=%d\n",
+ le32_to_cpu(dqhead->dqh_version), gversions[type],
+ type);
+ goto out_err;
+ }
+
+ ret = 1;
+out_err:
+ brelse(bh);
+ iput(ginode);
+ return ret;
+}
+
+/* Release given list of quota file chunks */
+static void ocfs2_release_local_quota_bitmaps(struct list_head *head)
+{
+ struct ocfs2_quota_chunk *pos, *next;
+
+ list_for_each_entry_safe(pos, next, head, qc_chunk) {
+ list_del(&pos->qc_chunk);
+ brelse(pos->qc_headerbh);
+ kmem_cache_free(ocfs2_qf_chunk_cachep, pos);
+ }
+}
+
+/* Load quota bitmaps into memory */
+static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
+ struct ocfs2_local_disk_dqinfo *ldinfo,
+ struct list_head *head)
+{
+ struct ocfs2_quota_chunk *newchunk;
+ int i, status;
+
+ INIT_LIST_HEAD(head);
+ for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) {
+ newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+ if (!newchunk) {
+ ocfs2_release_local_quota_bitmaps(head);
+ return -ENOMEM;
+ }
+ newchunk->qc_num = i;
+ newchunk->qc_headerbh = NULL;
+ status = ocfs2_read_quota_block(inode,
+ ol_quota_chunk_block(inode->i_sb, i),
+ &newchunk->qc_headerbh);
+ if (status) {
+ mlog_errno(status);
+ kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk);
+ ocfs2_release_local_quota_bitmaps(head);
+ return status;
+ }
+ list_add_tail(&newchunk->qc_chunk, head);
+ }
+ return 0;
+}
+
+static void olq_update_info(struct buffer_head *bh, void *private)
+{
+ struct mem_dqinfo *info = private;
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct ocfs2_local_disk_dqinfo *ldinfo;
+
+ ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+ OCFS2_LOCAL_INFO_OFF);
+ spin_lock(&dq_data_lock);
+ ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
+ ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
+ ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
+ spin_unlock(&dq_data_lock);
+}
+
+static int ocfs2_add_recovery_chunk(struct super_block *sb,
+ struct ocfs2_local_disk_chunk *dchunk,
+ int chunk,
+ struct list_head *head)
+{
+ struct ocfs2_recovery_chunk *rc;
+
+ rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS);
+ if (!rc)
+ return -ENOMEM;
+ rc->rc_chunk = chunk;
+ rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+ if (!rc->rc_bitmap) {
+ kfree(rc);
+ return -ENOMEM;
+ }
+ memcpy(rc->rc_bitmap, dchunk->dqc_bitmap,
+ (ol_chunk_entries(sb) + 7) >> 3);
+ list_add_tail(&rc->rc_list, head);
+ return 0;
+}
+
+static void free_recovery_list(struct list_head *head)
+{
+ struct ocfs2_recovery_chunk *next;
+ struct ocfs2_recovery_chunk *rchunk;
+
+ list_for_each_entry_safe(rchunk, next, head, rc_list) {
+ list_del(&rchunk->rc_list);
+ kfree(rchunk->rc_bitmap);
+ kfree(rchunk);
+ }
+}
+
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
+{
+ int type;
+
+ for (type = 0; type < MAXQUOTAS; type++)
+ free_recovery_list(&(rec->r_list[type]));
+ kfree(rec);
+}
+
+/* Load entries in our quota file we have to recover*/
+static int ocfs2_recovery_load_quota(struct inode *lqinode,
+ struct ocfs2_local_disk_dqinfo *ldinfo,
+ int type,
+ struct list_head *head)
+{
+ struct super_block *sb = lqinode->i_sb;
+ struct buffer_head *hbh;
+ struct ocfs2_local_disk_chunk *dchunk;
+ int i, chunks = le32_to_cpu(ldinfo->dqi_chunks);
+ int status = 0;
+
+ for (i = 0; i < chunks; i++) {
+ hbh = NULL;
+ status = ocfs2_read_quota_block(lqinode,
+ ol_quota_chunk_block(sb, i),
+ &hbh);
+ if (status) {
+ mlog_errno(status);
+ break;
+ }
+ dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+ if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb))
+ status = ocfs2_add_recovery_chunk(sb, dchunk, i, head);
+ brelse(hbh);
+ if (status < 0)
+ break;
+ }
+ if (status < 0)
+ free_recovery_list(head);
+ return status;
+}
+
+static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
+{
+ int type;
+ struct ocfs2_quota_recovery *rec;
+
+ rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
+ if (!rec)
+ return NULL;
+ for (type = 0; type < MAXQUOTAS; type++)
+ INIT_LIST_HEAD(&(rec->r_list[type]));
+ return rec;
+}
+
+/* Load information we need for quota recovery into memory */
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+ struct ocfs2_super *osb,
+ int slot_num)
+{
+ unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+ struct super_block *sb = osb->sb;
+ struct ocfs2_local_disk_dqinfo *ldinfo;
+ struct inode *lqinode;
+ struct buffer_head *bh;
+ int type;
+ int status = 0;
+ struct ocfs2_quota_recovery *rec;
+
+ printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for "
+ "slot %u\n", osb->dev_str, slot_num);
+
+ rec = ocfs2_alloc_quota_recovery();
+ if (!rec)
+ return ERR_PTR(-ENOMEM);
+ /* First init... */
+
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+ continue;
+ /* At this point, journal of the slot is already replayed so
+ * we can trust metadata and data of the quota file */
+ lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+ if (!lqinode) {
+ status = -ENOENT;
+ goto out;
+ }
+ status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+ OCFS2_META_LOCK_RECOVERY);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_put;
+ }
+ /* Now read local header */
+ bh = NULL;
+ status = ocfs2_read_quota_block(lqinode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read quota file info header "
+ "(slot=%d type=%d)\n", slot_num, type);
+ goto out_lock;
+ }
+ ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+ OCFS2_LOCAL_INFO_OFF);
+ status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+ &rec->r_list[type]);
+ brelse(bh);
+out_lock:
+ ocfs2_inode_unlock(lqinode, 1);
+out_put:
+ iput(lqinode);
+ if (status < 0)
+ break;
+ }
+out:
+ if (status < 0) {
+ ocfs2_free_quota_recovery(rec);
+ rec = ERR_PTR(status);
+ }
+ return rec;
+}
+
+/* Sync changes in local quota file into global quota file and
+ * reinitialize local quota file.
+ * The function expects local quota file to be already locked and
+ * dqonoff_mutex locked. */
+static int ocfs2_recover_local_quota_file(struct inode *lqinode,
+ int type,
+ struct ocfs2_quota_recovery *rec)
+{
+ struct super_block *sb = lqinode->i_sb;
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ struct ocfs2_local_disk_chunk *dchunk;
+ struct ocfs2_local_disk_dqblk *dqblk;
+ struct dquot *dquot;
+ handle_t *handle;
+ struct buffer_head *hbh = NULL, *qbh = NULL;
+ int status = 0;
+ int bit, chunk;
+ struct ocfs2_recovery_chunk *rchunk, *next;
+ qsize_t spacechange, inodechange;
+
+ trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type);
+
+ list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
+ chunk = rchunk->rc_chunk;
+ hbh = NULL;
+ status = ocfs2_read_quota_block(lqinode,
+ ol_quota_chunk_block(sb, chunk),
+ &hbh);
+ if (status) {
+ mlog_errno(status);
+ break;
+ }
+ dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+ for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
+ qbh = NULL;
+ status = ocfs2_read_quota_block(lqinode,
+ ol_dqblk_block(sb, chunk, bit),
+ &qbh);
+ if (status) {
+ mlog_errno(status);
+ break;
+ }
+ dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
+ ol_dqblk_block_off(sb, chunk, bit));
+ dquot = dqget(sb,
+ make_kqid(&init_user_ns, type,
+ le64_to_cpu(dqblk->dqb_id)));
+ if (!dquot) {
+ status = -EIO;
+ mlog(ML_ERROR, "Failed to get quota structure "
+ "for id %u, type %d. Cannot finish quota "
+ "file recovery.\n",
+ (unsigned)le64_to_cpu(dqblk->dqb_id),
+ type);
+ goto out_put_bh;
+ }
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_put_dquot;
+ }
+
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ OCFS2_QSYNC_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_drop_lock;
+ }
+ mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+ spin_lock(&dq_data_lock);
+ /* Add usage from quota entry into quota changes
+ * of our node. Auxiliary variables are important
+ * due to signedness */
+ spacechange = le64_to_cpu(dqblk->dqb_spacemod);
+ inodechange = le64_to_cpu(dqblk->dqb_inodemod);
+ dquot->dq_dqb.dqb_curspace += spacechange;
+ dquot->dq_dqb.dqb_curinodes += inodechange;
+ spin_unlock(&dq_data_lock);
+ /* We want to drop reference held by the crashed
+ * node. Since we have our own reference we know
+ * global structure actually won't be freed. */
+ status = ocfs2_global_release_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+ /* Release local quota file entry */
+ status = ocfs2_journal_access_dq(handle,
+ INODE_CACHE(lqinode),
+ qbh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+ lock_buffer(qbh);
+ WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap));
+ ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap);
+ le32_add_cpu(&dchunk->dqc_free, 1);
+ unlock_buffer(qbh);
+ ocfs2_journal_dirty(handle, qbh);
+out_commit:
+ mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_drop_lock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out_put_dquot:
+ dqput(dquot);
+out_put_bh:
+ brelse(qbh);
+ if (status < 0)
+ break;
+ }
+ brelse(hbh);
+ list_del(&rchunk->rc_list);
+ kfree(rchunk->rc_bitmap);
+ kfree(rchunk);
+ if (status < 0)
+ break;
+ }
+ if (status < 0)
+ free_recovery_list(&(rec->r_list[type]));
+ if (status)
+ mlog_errno(status);
+ return status;
+}
+
+/* Recover local quota files for given node different from us */
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+ struct ocfs2_quota_recovery *rec,
+ int slot_num)
+{
+ unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+ struct super_block *sb = osb->sb;
+ struct ocfs2_local_disk_dqinfo *ldinfo;
+ struct buffer_head *bh;
+ handle_t *handle;
+ int type;
+ int status = 0;
+ struct inode *lqinode;
+ unsigned int flags;
+
+ printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
+ "slot %u\n", osb->dev_str, slot_num);
+
+ mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (list_empty(&(rec->r_list[type])))
+ continue;
+ trace_ocfs2_finish_quota_recovery(slot_num);
+ lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+ if (!lqinode) {
+ status = -ENOENT;
+ goto out;
+ }
+ status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+ OCFS2_META_LOCK_NOQUEUE);
+ /* Someone else is holding the lock? Then he must be
+ * doing the recovery. Just skip the file... */
+ if (status == -EAGAIN) {
+ printk(KERN_NOTICE "ocfs2: Skipping quota recovery on "
+ "device (%s) for slot %d because quota file is "
+ "locked.\n", osb->dev_str, slot_num);
+ status = 0;
+ goto out_put;
+ } else if (status < 0) {
+ mlog_errno(status);
+ goto out_put;
+ }
+ /* Now read local header */
+ bh = NULL;
+ status = ocfs2_read_quota_block(lqinode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read quota file info header "
+ "(slot=%d type=%d)\n", slot_num, type);
+ goto out_lock;
+ }
+ ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+ OCFS2_LOCAL_INFO_OFF);
+ /* Is recovery still needed? */
+ flags = le32_to_cpu(ldinfo->dqi_flags);
+ if (!(flags & OLQF_CLEAN))
+ status = ocfs2_recover_local_quota_file(lqinode,
+ type,
+ rec);
+ /* We don't want to mark file as clean when it is actually
+ * active */
+ if (slot_num == osb->slot_num)
+ goto out_bh;
+ /* Mark quota file as clean if we are recovering quota file of
+ * some other node. */
+ handle = ocfs2_start_trans(osb,
+ OCFS2_LOCAL_QINFO_WRITE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_bh;
+ }
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
+ bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ lock_buffer(bh);
+ ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
+ unlock_buffer(bh);
+ ocfs2_journal_dirty(handle, bh);
+out_trans:
+ ocfs2_commit_trans(osb, handle);
+out_bh:
+ brelse(bh);
+out_lock:
+ ocfs2_inode_unlock(lqinode, 1);
+out_put:
+ iput(lqinode);
+ if (status < 0)
+ break;
+ }
+out:
+ mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+ kfree(rec);
+ return status;
+}
+
+/* Read information header from quota file */
+static int ocfs2_local_read_info(struct super_block *sb, int type)
+{
+ struct ocfs2_local_disk_dqinfo *ldinfo;
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo;
+ struct inode *lqinode = sb_dqopt(sb)->files[type];
+ int status;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_quota_recovery *rec;
+ int locked = 0;
+
+ /* We don't need the lock and we have to acquire quota file locks
+ * which will later depend on this lock */
+ mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+ info->dqi_maxblimit = 0x7fffffffffffffffLL;
+ info->dqi_maxilimit = 0x7fffffffffffffffLL;
+ oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
+ if (!oinfo) {
+ mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
+ " info.");
+ goto out_err;
+ }
+ info->dqi_priv = oinfo;
+ oinfo->dqi_type = type;
+ INIT_LIST_HEAD(&oinfo->dqi_chunk);
+ oinfo->dqi_rec = NULL;
+ oinfo->dqi_lqi_bh = NULL;
+ oinfo->dqi_libh = NULL;
+
+ status = ocfs2_global_read_info(sb, type);
+ if (status < 0)
+ goto out_err;
+
+ status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+ locked = 1;
+
+ /* Now read local header */
+ status = ocfs2_read_quota_block(lqinode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read quota file info header "
+ "(type=%d)\n", type);
+ goto out_err;
+ }
+ ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+ OCFS2_LOCAL_INFO_OFF);
+ info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
+ oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
+ oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
+ oinfo->dqi_libh = bh;
+
+ /* We crashed when using local quota file? */
+ if (!(info->dqi_flags & OLQF_CLEAN)) {
+ rec = OCFS2_SB(sb)->quota_rec;
+ if (!rec) {
+ rec = ocfs2_alloc_quota_recovery();
+ if (!rec) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_err;
+ }
+ OCFS2_SB(sb)->quota_rec = rec;
+ }
+
+ status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+ &rec->r_list[type]);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+ }
+
+ status = ocfs2_load_local_quota_bitmaps(lqinode,
+ ldinfo,
+ &oinfo->dqi_chunk);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+
+ /* Now mark quota file as used */
+ info->dqi_flags &= ~OLQF_CLEAN;
+ status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+
+ mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+ return 0;
+out_err:
+ if (oinfo) {
+ iput(oinfo->dqi_gqinode);
+ ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+ ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+ brelse(oinfo->dqi_lqi_bh);
+ if (locked)
+ ocfs2_inode_unlock(lqinode, 1);
+ ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+ kfree(oinfo);
+ }
+ brelse(bh);
+ mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+ return -1;
+}
+
+/* Write local info to quota file */
+static int ocfs2_local_write_info(struct super_block *sb, int type)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
+ ->dqi_libh;
+ int status;
+
+ status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
+ info);
+ if (status < 0) {
+ mlog_errno(status);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Release info from memory */
+static int ocfs2_local_free_info(struct super_block *sb, int type)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct ocfs2_quota_chunk *chunk;
+ struct ocfs2_local_disk_chunk *dchunk;
+ int mark_clean = 1, len;
+ int status;
+
+ iput(oinfo->dqi_gqinode);
+ ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+ ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+ list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+ dchunk = (struct ocfs2_local_disk_chunk *)
+ (chunk->qc_headerbh->b_data);
+ if (chunk->qc_num < oinfo->dqi_chunks - 1) {
+ len = ol_chunk_entries(sb);
+ } else {
+ len = (oinfo->dqi_blocks -
+ ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+ * ol_quota_entries_per_block(sb);
+ }
+ /* Not all entries free? Bug! */
+ if (le32_to_cpu(dchunk->dqc_free) != len) {
+ mlog(ML_ERROR, "releasing quota file with used "
+ "entries (type=%d)\n", type);
+ mark_clean = 0;
+ }
+ }
+ ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+
+ /* dqonoff_mutex protects us against racing with recovery thread... */
+ if (oinfo->dqi_rec) {
+ ocfs2_free_quota_recovery(oinfo->dqi_rec);
+ mark_clean = 0;
+ }
+
+ if (!mark_clean)
+ goto out;
+
+ /* Mark local file as clean */
+ info->dqi_flags |= OLQF_CLEAN;
+ status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
+ oinfo->dqi_libh,
+ olq_update_info,
+ info);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+out:
+ ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
+ brelse(oinfo->dqi_libh);
+ brelse(oinfo->dqi_lqi_bh);
+ kfree(oinfo);
+ return 0;
+}
+
+static void olq_set_dquot(struct buffer_head *bh, void *private)
+{
+ struct ocfs2_dquot *od = private;
+ struct ocfs2_local_disk_dqblk *dqblk;
+ struct super_block *sb = od->dq_dquot.dq_sb;
+
+ dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data
+ + ol_dqblk_block_offset(sb, od->dq_local_off));
+
+ dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns,
+ od->dq_dquot.dq_id));
+ spin_lock(&dq_data_lock);
+ dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
+ od->dq_origspace);
+ dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
+ od->dq_originodes);
+ spin_unlock(&dq_data_lock);
+ trace_olq_set_dquot(
+ (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod),
+ (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod),
+ from_kqid(&init_user_ns, od->dq_dquot.dq_id));
+}
+
+/* Write dquot to local quota file */
+int ocfs2_local_write_dquot(struct dquot *dquot)
+{
+ struct super_block *sb = dquot->dq_sb;
+ struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+ struct buffer_head *bh;
+ struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_id.type];
+ int status;
+
+ status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk,
+ &bh);
+ if (status) {
+ mlog_errno(status);
+ goto out;
+ }
+ status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+out:
+ brelse(bh);
+ return status;
+}
+
+/* Find free entry in local quota file */
+static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
+ int type,
+ int *offset)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct ocfs2_quota_chunk *chunk;
+ struct ocfs2_local_disk_chunk *dchunk;
+ int found = 0, len;
+
+ list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+ dchunk = (struct ocfs2_local_disk_chunk *)
+ chunk->qc_headerbh->b_data;
+ if (le32_to_cpu(dchunk->dqc_free) > 0) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return NULL;
+
+ if (chunk->qc_num < oinfo->dqi_chunks - 1) {
+ len = ol_chunk_entries(sb);
+ } else {
+ len = (oinfo->dqi_blocks -
+ ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+ * ol_quota_entries_per_block(sb);
+ }
+
+ found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0);
+ /* We failed? */
+ if (found == len) {
+ mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
+ " entries free (type=%d)\n", chunk->qc_num,
+ le32_to_cpu(dchunk->dqc_free), type);
+ return ERR_PTR(-EIO);
+ }
+ *offset = found;
+ return chunk;
+}
+
+/* Add new chunk to the local quota file */
+static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
+ struct super_block *sb,
+ int type,
+ int *offset)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct inode *lqinode = sb_dqopt(sb)->files[type];
+ struct ocfs2_quota_chunk *chunk = NULL;
+ struct ocfs2_local_disk_chunk *dchunk;
+ int status;
+ handle_t *handle;
+ struct buffer_head *bh = NULL, *dbh = NULL;
+ u64 p_blkno;
+
+ /* We are protected by dqio_sem so no locking needed */
+ status = ocfs2_extend_no_holes(lqinode, NULL,
+ i_size_read(lqinode) + 2 * sb->s_blocksize,
+ i_size_read(lqinode));
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+ i_size_read(lqinode) + 2 * sb->s_blocksize);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+ if (!chunk) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out;
+ }
+ /* Local quota info and two new blocks we initialize */
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+ 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out;
+ }
+
+ /* Initialize chunk header */
+ status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+ &p_blkno, NULL, NULL);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ bh = sb_getblk(sb, p_blkno);
+ if (!bh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_trans;
+ }
+ dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ lock_buffer(bh);
+ dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
+ memset(dchunk->dqc_bitmap, 0,
+ sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+ OCFS2_QBLK_RESERVED_SPACE);
+ unlock_buffer(bh);
+ ocfs2_journal_dirty(handle, bh);
+
+ /* Initialize new block with structures */
+ status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
+ &p_blkno, NULL, NULL);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ dbh = sb_getblk(sb, p_blkno);
+ if (!dbh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_trans;
+ }
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), dbh);
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), dbh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ lock_buffer(dbh);
+ memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
+ unlock_buffer(dbh);
+ ocfs2_journal_dirty(handle, dbh);
+
+ /* Update local quotafile info */
+ oinfo->dqi_blocks += 2;
+ oinfo->dqi_chunks++;
+ status = ocfs2_local_write_info(sb, type);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk);
+ chunk->qc_num = list_entry(chunk->qc_chunk.prev,
+ struct ocfs2_quota_chunk,
+ qc_chunk)->qc_num + 1;
+ chunk->qc_headerbh = bh;
+ *offset = 0;
+ return chunk;
+out_trans:
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+ brelse(bh);
+ brelse(dbh);
+ kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+ return ERR_PTR(status);
+}
+
+/* Find free entry in local quota file */
+static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
+ struct super_block *sb,
+ int type,
+ int *offset)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct ocfs2_quota_chunk *chunk;
+ struct inode *lqinode = sb_dqopt(sb)->files[type];
+ struct ocfs2_local_disk_chunk *dchunk;
+ int epb = ol_quota_entries_per_block(sb);
+ unsigned int chunk_blocks;
+ struct buffer_head *bh;
+ u64 p_blkno;
+ int status;
+ handle_t *handle;
+
+ if (list_empty(&oinfo->dqi_chunk))
+ return ocfs2_local_quota_add_chunk(sb, type, offset);
+ /* Is the last chunk full? */
+ chunk = list_entry(oinfo->dqi_chunk.prev,
+ struct ocfs2_quota_chunk, qc_chunk);
+ chunk_blocks = oinfo->dqi_blocks -
+ ol_quota_chunk_block(sb, chunk->qc_num) - 1;
+ if (ol_chunk_blocks(sb) == chunk_blocks)
+ return ocfs2_local_quota_add_chunk(sb, type, offset);
+
+ /* We are protected by dqio_sem so no locking needed */
+ status = ocfs2_extend_no_holes(lqinode, NULL,
+ i_size_read(lqinode) + sb->s_blocksize,
+ i_size_read(lqinode));
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+ i_size_read(lqinode) + sb->s_blocksize);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ /* Get buffer from the just added block */
+ status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+ &p_blkno, NULL, NULL);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ bh = sb_getblk(sb, p_blkno);
+ if (!bh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out;
+ }
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
+
+ /* Local quota info, chunk header and the new block we initialize */
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+ 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out;
+ }
+ /* Zero created block */
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ lock_buffer(bh);
+ memset(bh->b_data, 0, sb->s_blocksize);
+ unlock_buffer(bh);
+ ocfs2_journal_dirty(handle, bh);
+
+ /* Update chunk header */
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
+ chunk->qc_headerbh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+
+ dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data;
+ lock_buffer(chunk->qc_headerbh);
+ le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
+ unlock_buffer(chunk->qc_headerbh);
+ ocfs2_journal_dirty(handle, chunk->qc_headerbh);
+
+ /* Update file header */
+ oinfo->dqi_blocks++;
+ status = ocfs2_local_write_info(sb, type);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+
+ status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ *offset = chunk_blocks * epb;
+ return chunk;
+out_trans:
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+ return ERR_PTR(status);
+}
+
+static void olq_alloc_dquot(struct buffer_head *bh, void *private)
+{
+ int *offset = private;
+ struct ocfs2_local_disk_chunk *dchunk;
+
+ dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+ ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap);
+ le32_add_cpu(&dchunk->dqc_free, -1);
+}
+
+/* Create dquot in the local file for given id */
+int ocfs2_create_local_dquot(struct dquot *dquot)
+{
+ struct super_block *sb = dquot->dq_sb;
+ int type = dquot->dq_id.type;
+ struct inode *lqinode = sb_dqopt(sb)->files[type];
+ struct ocfs2_quota_chunk *chunk;
+ struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+ int offset;
+ int status;
+ u64 pcount;
+
+ down_write(&OCFS2_I(lqinode)->ip_alloc_sem);
+ chunk = ocfs2_find_free_entry(sb, type, &offset);
+ if (!chunk) {
+ chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
+ if (IS_ERR(chunk)) {
+ status = PTR_ERR(chunk);
+ goto out;
+ }
+ } else if (IS_ERR(chunk)) {
+ status = PTR_ERR(chunk);
+ goto out;
+ }
+ od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
+ od->dq_chunk = chunk;
+ status = ocfs2_extent_map_get_blocks(lqinode,
+ ol_dqblk_block(sb, chunk->qc_num, offset),
+ &od->dq_local_phys_blk,
+ &pcount,
+ NULL);
+
+ /* Initialize dquot structure on disk */
+ status = ocfs2_local_write_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ /* Mark structure as allocated */
+ status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot,
+ &offset);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+out:
+ up_write(&OCFS2_I(lqinode)->ip_alloc_sem);
+ return status;
+}
+
+/*
+ * Release dquot structure from local quota file. ocfs2_release_dquot() has
+ * already started a transaction and written all changes to global quota file
+ */
+int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
+{
+ int status;
+ int type = dquot->dq_id.type;
+ struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+ struct super_block *sb = dquot->dq_sb;
+ struct ocfs2_local_disk_chunk *dchunk;
+ int offset;
+
+ status = ocfs2_journal_access_dq(handle,
+ INODE_CACHE(sb_dqopt(sb)->files[type]),
+ od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num,
+ od->dq_local_off);
+ dchunk = (struct ocfs2_local_disk_chunk *)
+ (od->dq_chunk->qc_headerbh->b_data);
+ /* Mark structure as freed */
+ lock_buffer(od->dq_chunk->qc_headerbh);
+ ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap);
+ le32_add_cpu(&dchunk->dqc_free, 1);
+ unlock_buffer(od->dq_chunk->qc_headerbh);
+ ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
+
+out:
+ return status;
+}
+
+static const struct quota_format_ops ocfs2_format_ops = {
+ .check_quota_file = ocfs2_local_check_quota_file,
+ .read_file_info = ocfs2_local_read_info,
+ .write_file_info = ocfs2_global_write_info,
+ .free_file_info = ocfs2_local_free_info,
+};
+
+struct quota_format_type ocfs2_quota_format = {
+ .qf_fmt_id = QFMT_OCFS2,
+ .qf_ops = &ocfs2_format_ops,
+ .qf_owner = THIS_MODULE
+};
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
new file mode 100644
index 00000000000..636aab69ead
--- /dev/null
+++ b/fs/ocfs2/refcounttree.c
@@ -0,0 +1,4476 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * refcounttree.c
+ *
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/sort.h>
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "suballoc.h"
+#include "journal.h"
+#include "uptodate.h"
+#include "super.h"
+#include "buffer_head_io.h"
+#include "blockcheck.h"
+#include "refcounttree.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "aops.h"
+#include "xattr.h"
+#include "namei.h"
+#include "ocfs2_trace.h"
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+#include <linux/security.h>
+#include <linux/fsnotify.h>
+#include <linux/quotaops.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/posix_acl.h>
+
+struct ocfs2_cow_context {
+ struct inode *inode;
+ u32 cow_start;
+ u32 cow_len;
+ struct ocfs2_extent_tree data_et;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct buffer_head *ref_root_bh;
+ struct ocfs2_alloc_context *meta_ac;
+ struct ocfs2_alloc_context *data_ac;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ void *cow_object;
+ struct ocfs2_post_refcount *post_refcount;
+ int extra_credits;
+ int (*get_clusters)(struct ocfs2_cow_context *context,
+ u32 v_cluster, u32 *p_cluster,
+ u32 *num_clusters,
+ unsigned int *extent_flags);
+ int (*cow_duplicate_clusters)(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len);
+};
+
+static inline struct ocfs2_refcount_tree *
+cache_info_to_refcount(struct ocfs2_caching_info *ci)
+{
+ return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
+}
+
+static int ocfs2_validate_refcount_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)bh->b_data;
+
+ trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
+ if (rc) {
+ mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ return rc;
+ }
+
+
+ if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
+ ocfs2_error(sb,
+ "Refcount block #%llu has bad signature %.*s",
+ (unsigned long long)bh->b_blocknr, 7,
+ rb->rf_signature);
+ return -EINVAL;
+ }
+
+ if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
+ ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_blkno "
+ "of %llu",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(rb->rf_blkno));
+ return -EINVAL;
+ }
+
+ if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+ ocfs2_error(sb,
+ "Refcount block #%llu has an invalid "
+ "rf_fs_generation of #%u",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(rb->rf_fs_generation));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
+ u64 rb_blkno,
+ struct buffer_head **bh)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_block(ci, rb_blkno, &tmp,
+ ocfs2_validate_refcount_block);
+
+ /* If ocfs2_read_block() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
+static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+ return rf->rf_blkno;
+}
+
+static struct super_block *
+ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+ return rf->rf_sb;
+}
+
+static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+ spin_lock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+ spin_unlock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+ mutex_lock(&rf->rf_io_mutex);
+}
+
+static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+ mutex_unlock(&rf->rf_io_mutex);
+}
+
+static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
+ .co_owner = ocfs2_refcount_cache_owner,
+ .co_get_super = ocfs2_refcount_cache_get_super,
+ .co_cache_lock = ocfs2_refcount_cache_lock,
+ .co_cache_unlock = ocfs2_refcount_cache_unlock,
+ .co_io_lock = ocfs2_refcount_cache_io_lock,
+ .co_io_unlock = ocfs2_refcount_cache_io_unlock,
+};
+
+static struct ocfs2_refcount_tree *
+ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
+{
+ struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
+ struct ocfs2_refcount_tree *tree = NULL;
+
+ while (n) {
+ tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
+
+ if (blkno < tree->rf_blkno)
+ n = n->rb_left;
+ else if (blkno > tree->rf_blkno)
+ n = n->rb_right;
+ else
+ return tree;
+ }
+
+ return NULL;
+}
+
+/* osb_lock is already locked. */
+static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *new)
+{
+ u64 rf_blkno = new->rf_blkno;
+ struct rb_node *parent = NULL;
+ struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
+ struct ocfs2_refcount_tree *tmp;
+
+ while (*p) {
+ parent = *p;
+
+ tmp = rb_entry(parent, struct ocfs2_refcount_tree,
+ rf_node);
+
+ if (rf_blkno < tmp->rf_blkno)
+ p = &(*p)->rb_left;
+ else if (rf_blkno > tmp->rf_blkno)
+ p = &(*p)->rb_right;
+ else {
+ /* This should never happen! */
+ mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
+ (unsigned long long)rf_blkno);
+ BUG();
+ }
+ }
+
+ rb_link_node(&new->rf_node, parent, p);
+ rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
+}
+
+static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
+{
+ ocfs2_metadata_cache_exit(&tree->rf_ci);
+ ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
+ ocfs2_lock_res_free(&tree->rf_lockres);
+ kfree(tree);
+}
+
+static inline void
+ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *tree)
+{
+ rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
+ if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
+ osb->osb_ref_tree_lru = NULL;
+}
+
+static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *tree)
+{
+ spin_lock(&osb->osb_lock);
+ ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
+ spin_unlock(&osb->osb_lock);
+}
+
+static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
+{
+ struct ocfs2_refcount_tree *tree =
+ container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
+
+ ocfs2_free_refcount_tree(tree);
+}
+
+static inline void
+ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
+{
+ kref_get(&tree->rf_getcnt);
+}
+
+static inline void
+ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
+{
+ kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
+}
+
+static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
+ struct super_block *sb)
+{
+ ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
+ mutex_init(&new->rf_io_mutex);
+ new->rf_sb = sb;
+ spin_lock_init(&new->rf_lock);
+}
+
+static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *new,
+ u64 rf_blkno, u32 generation)
+{
+ init_rwsem(&new->rf_sem);
+ ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
+ rf_blkno, generation);
+}
+
+static struct ocfs2_refcount_tree*
+ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
+{
+ struct ocfs2_refcount_tree *new;
+
+ new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
+ if (!new)
+ return NULL;
+
+ new->rf_blkno = rf_blkno;
+ kref_init(&new->rf_getcnt);
+ ocfs2_init_refcount_tree_ci(new, osb->sb);
+
+ return new;
+}
+
+static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
+ struct ocfs2_refcount_tree **ret_tree)
+{
+ int ret = 0;
+ struct ocfs2_refcount_tree *tree, *new = NULL;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_block *ref_rb;
+
+ spin_lock(&osb->osb_lock);
+ if (osb->osb_ref_tree_lru &&
+ osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
+ tree = osb->osb_ref_tree_lru;
+ else
+ tree = ocfs2_find_refcount_tree(osb, rf_blkno);
+ if (tree)
+ goto out;
+
+ spin_unlock(&osb->osb_lock);
+
+ new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
+ if (!new) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ return ret;
+ }
+ /*
+ * We need the generation to create the refcount tree lock and since
+ * it isn't changed during the tree modification, we are safe here to
+ * read without protection.
+ * We also have to purge the cache after we create the lock since the
+ * refcount block may have the stale data. It can only be trusted when
+ * we hold the refcount lock.
+ */
+ ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ ocfs2_metadata_cache_exit(&new->rf_ci);
+ kfree(new);
+ return ret;
+ }
+
+ ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
+ ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
+ new->rf_generation);
+ ocfs2_metadata_cache_purge(&new->rf_ci);
+
+ spin_lock(&osb->osb_lock);
+ tree = ocfs2_find_refcount_tree(osb, rf_blkno);
+ if (tree)
+ goto out;
+
+ ocfs2_insert_refcount_tree(osb, new);
+
+ tree = new;
+ new = NULL;
+
+out:
+ *ret_tree = tree;
+
+ osb->osb_ref_tree_lru = tree;
+
+ spin_unlock(&osb->osb_lock);
+
+ if (new)
+ ocfs2_free_refcount_tree(new);
+
+ brelse(ref_root_bh);
+ return ret;
+}
+
+static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ *ref_blkno = le64_to_cpu(di->i_refcount_loc);
+ brelse(di_bh);
+out:
+ return ret;
+}
+
+static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *tree, int rw)
+{
+ int ret;
+
+ ret = ocfs2_refcount_lock(tree, rw);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (rw)
+ down_write(&tree->rf_sem);
+ else
+ down_read(&tree->rf_sem);
+
+out:
+ return ret;
+}
+
+/*
+ * Lock the refcount tree pointed by ref_blkno and return the tree.
+ * In most case, we lock the tree and read the refcount block.
+ * So read it here if the caller really needs it.
+ *
+ * If the tree has been re-created by other node, it will free the
+ * old one and re-create it.
+ */
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
+ u64 ref_blkno, int rw,
+ struct ocfs2_refcount_tree **ret_tree,
+ struct buffer_head **ref_bh)
+{
+ int ret, delete_tree = 0;
+ struct ocfs2_refcount_tree *tree = NULL;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_block *rb;
+
+again:
+ ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ocfs2_refcount_tree_get(tree);
+
+ ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
+ if (ret) {
+ mlog_errno(ret);
+ ocfs2_refcount_tree_put(tree);
+ goto out;
+ }
+
+ ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
+ &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ ocfs2_unlock_refcount_tree(osb, tree, rw);
+ ocfs2_refcount_tree_put(tree);
+ goto out;
+ }
+
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ /*
+ * If the refcount block has been freed and re-created, we may need
+ * to recreate the refcount tree also.
+ *
+ * Here we just remove the tree from the rb-tree, and the last
+ * kref holder will unlock and delete this refcount_tree.
+ * Then we goto "again" and ocfs2_get_refcount_tree will create
+ * the new refcount tree for us.
+ */
+ if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
+ if (!tree->rf_removed) {
+ ocfs2_erase_refcount_tree_from_list(osb, tree);
+ tree->rf_removed = 1;
+ delete_tree = 1;
+ }
+
+ ocfs2_unlock_refcount_tree(osb, tree, rw);
+ /*
+ * We get an extra reference when we create the refcount
+ * tree, so another put will destroy it.
+ */
+ if (delete_tree)
+ ocfs2_refcount_tree_put(tree);
+ brelse(ref_root_bh);
+ ref_root_bh = NULL;
+ goto again;
+ }
+
+ *ret_tree = tree;
+ if (ref_bh) {
+ *ref_bh = ref_root_bh;
+ ref_root_bh = NULL;
+ }
+out:
+ brelse(ref_root_bh);
+ return ret;
+}
+
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *tree, int rw)
+{
+ if (rw)
+ up_write(&tree->rf_sem);
+ else
+ up_read(&tree->rf_sem);
+
+ ocfs2_refcount_unlock(tree, rw);
+ ocfs2_refcount_tree_put(tree);
+}
+
+void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
+{
+ struct rb_node *node;
+ struct ocfs2_refcount_tree *tree;
+ struct rb_root *root = &osb->osb_rf_lock_tree;
+
+ while ((node = rb_last(root)) != NULL) {
+ tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
+
+ trace_ocfs2_purge_refcount_trees(
+ (unsigned long long) tree->rf_blkno);
+
+ rb_erase(&tree->rf_node, root);
+ ocfs2_free_refcount_tree(tree);
+ }
+}
+
+/*
+ * Create a refcount tree for an inode.
+ * We take for granted that the inode is already locked.
+ */
+static int ocfs2_create_refcount_tree(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+ int ret;
+ handle_t *handle = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_refcount_block *rb;
+ struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
+ u16 suballoc_bit_start;
+ u32 num_got;
+ u64 suballoc_loc, first_blkno;
+
+ BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+
+ trace_ocfs2_create_refcount_tree(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+ &suballoc_bit_start, &num_got,
+ &first_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
+ if (!new_tree) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ new_bh = sb_getblk(inode->i_sb, first_blkno);
+ if (!new_bh) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
+
+ ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /* Initialize ocfs2_refcount_block. */
+ rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+ memset(rb, 0, inode->i_sb->s_blocksize);
+ strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+ rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
+ rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
+ rb->rf_blkno = cpu_to_le64(first_blkno);
+ rb->rf_count = cpu_to_le32(1);
+ rb->rf_records.rl_count =
+ cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
+ spin_lock(&osb->osb_lock);
+ rb->rf_generation = osb->s_next_generation++;
+ spin_unlock(&osb->osb_lock);
+
+ ocfs2_journal_dirty(handle, new_bh);
+
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ di->i_refcount_loc = cpu_to_le64(first_blkno);
+ spin_unlock(&oi->ip_lock);
+
+ trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+ /*
+ * We have to init the tree lock here since it will use
+ * the generation number to create it.
+ */
+ new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
+ ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
+ new_tree->rf_generation);
+
+ spin_lock(&osb->osb_lock);
+ tree = ocfs2_find_refcount_tree(osb, first_blkno);
+
+ /*
+ * We've just created a new refcount tree in this block. If
+ * we found a refcount tree on the ocfs2_super, it must be
+ * one we just deleted. We free the old tree before
+ * inserting the new tree.
+ */
+ BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
+ if (tree)
+ ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
+ ocfs2_insert_refcount_tree(osb, new_tree);
+ spin_unlock(&osb->osb_lock);
+ new_tree = NULL;
+ if (tree)
+ ocfs2_refcount_tree_put(tree);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ if (new_tree) {
+ ocfs2_metadata_cache_exit(&new_tree->rf_ci);
+ kfree(new_tree);
+ }
+
+ brelse(new_bh);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ return ret;
+}
+
+static int ocfs2_set_refcount_tree(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 refcount_loc)
+{
+ int ret;
+ handle_t *handle = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_block *rb;
+ struct ocfs2_refcount_tree *ref_tree;
+
+ BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+
+ ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+ &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ le32_add_cpu(&rb->rf_count, 1);
+
+ ocfs2_journal_dirty(handle, ref_root_bh);
+
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ di->i_refcount_loc = cpu_to_le64(refcount_loc);
+ spin_unlock(&oi->ip_lock);
+ ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+
+ return ret;
+}
+
+int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
+{
+ int ret, delete_tree = 0;
+ handle_t *handle = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_refcount_block *rb;
+ struct inode *alloc_inode = NULL;
+ struct buffer_head *alloc_bh = NULL;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_refcount_tree *ref_tree;
+ int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
+ u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
+ u16 bit = 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+ return 0;
+
+ BUG_ON(!ref_blkno);
+ ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
+
+ /*
+ * If we are the last user, we need to free the block.
+ * So lock the allocator ahead.
+ */
+ if (le32_to_cpu(rb->rf_count) == 1) {
+ blk = le64_to_cpu(rb->rf_blkno);
+ bit = le16_to_cpu(rb->rf_suballoc_bit);
+ if (rb->rf_suballoc_loc)
+ bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
+ else
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+ alloc_inode = ocfs2_get_system_file_inode(osb,
+ EXTENT_ALLOC_SYSTEM_INODE,
+ le16_to_cpu(rb->rf_suballoc_slot));
+ if (!alloc_inode) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ mutex_lock(&alloc_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+
+ credits += OCFS2_SUBALLOC_FREE;
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ di->i_refcount_loc = 0;
+ spin_unlock(&oi->ip_lock);
+ ocfs2_journal_dirty(handle, di_bh);
+
+ le32_add_cpu(&rb->rf_count , -1);
+ ocfs2_journal_dirty(handle, blk_bh);
+
+ if (!rb->rf_count) {
+ delete_tree = 1;
+ ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
+ ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
+ alloc_bh, bit, bg_blkno, 1);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out_unlock:
+ if (alloc_inode) {
+ ocfs2_inode_unlock(alloc_inode, 1);
+ brelse(alloc_bh);
+ }
+out_mutex:
+ if (alloc_inode) {
+ mutex_unlock(&alloc_inode->i_mutex);
+ iput(alloc_inode);
+ }
+out:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ if (delete_tree)
+ ocfs2_refcount_tree_put(ref_tree);
+ brelse(blk_bh);
+
+ return ret;
+}
+
+static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_leaf_bh,
+ u64 cpos, unsigned int len,
+ struct ocfs2_refcount_rec *ret_rec,
+ int *index)
+{
+ int i = 0;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_rec *rec = NULL;
+
+ for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
+ rec = &rb->rf_records.rl_recs[i];
+
+ if (le64_to_cpu(rec->r_cpos) +
+ le32_to_cpu(rec->r_clusters) <= cpos)
+ continue;
+ else if (le64_to_cpu(rec->r_cpos) > cpos)
+ break;
+
+ /* ok, cpos fail in this rec. Just return. */
+ if (ret_rec)
+ *ret_rec = *rec;
+ goto out;
+ }
+
+ if (ret_rec) {
+ /* We meet with a hole here, so fake the rec. */
+ ret_rec->r_cpos = cpu_to_le64(cpos);
+ ret_rec->r_refcount = 0;
+ if (i < le16_to_cpu(rb->rf_records.rl_used) &&
+ le64_to_cpu(rec->r_cpos) < cpos + len)
+ ret_rec->r_clusters =
+ cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
+ else
+ ret_rec->r_clusters = cpu_to_le32(len);
+ }
+
+out:
+ *index = i;
+}
+
+/*
+ * Try to remove refcount tree. The mechanism is:
+ * 1) Check whether i_clusters == 0, if no, exit.
+ * 2) check whether we have i_xattr_loc in dinode. if yes, exit.
+ * 3) Check whether we have inline xattr stored outside, if yes, exit.
+ * 4) Remove the tree.
+ */
+int ocfs2_try_remove_refcount_tree(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+ int ret;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ down_write(&oi->ip_xattr_sem);
+ down_write(&oi->ip_alloc_sem);
+
+ if (oi->ip_clusters)
+ goto out;
+
+ if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
+ goto out;
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
+ ocfs2_has_inline_xattr_value_outside(inode, di))
+ goto out;
+
+ ret = ocfs2_remove_refcount_tree(inode, di_bh);
+ if (ret)
+ mlog_errno(ret);
+out:
+ up_write(&oi->ip_alloc_sem);
+ up_write(&oi->ip_xattr_sem);
+ return 0;
+}
+
+/*
+ * Find the end range for a leaf refcount block indicated by
+ * el->l_recs[index].e_blkno.
+ */
+static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_extent_block *eb,
+ struct ocfs2_extent_list *el,
+ int index, u32 *cpos_end)
+{
+ int ret, i, subtree_root;
+ u32 cpos;
+ u64 blkno;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct ocfs2_path *left_path = NULL, *right_path = NULL;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_extent_list *tmp_el;
+
+ if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
+ /*
+ * We have a extent rec after index, so just use the e_cpos
+ * of the next extent rec.
+ */
+ *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
+ return 0;
+ }
+
+ if (!eb || (eb && !eb->h_next_leaf_blk)) {
+ /*
+ * We are the last extent rec, so any high cpos should
+ * be stored in this leaf refcount block.
+ */
+ *cpos_end = UINT_MAX;
+ return 0;
+ }
+
+ /*
+ * If the extent block isn't the last one, we have to find
+ * the subtree root between this extent block and the next
+ * leaf extent block and get the corresponding e_cpos from
+ * the subroot. Otherwise we may corrupt the b-tree.
+ */
+ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+
+ left_path = ocfs2_new_path_from_et(&et);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
+ ret = ocfs2_find_path(ci, left_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ right_path = ocfs2_new_path_from_path(left_path);
+ if (!right_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(ci, right_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ subtree_root = ocfs2_find_subtree_root(&et, left_path,
+ right_path);
+
+ tmp_el = left_path->p_node[subtree_root].el;
+ blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
+ for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) {
+ if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
+ *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
+ break;
+ }
+ }
+
+ BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec));
+
+out:
+ ocfs2_free_path(left_path);
+ ocfs2_free_path(right_path);
+ return ret;
+}
+
+/*
+ * Given a cpos and len, try to find the refcount record which contains cpos.
+ * 1. If cpos can be found in one refcount record, return the record.
+ * 2. If cpos can't be found, return a fake record which start from cpos
+ * and end at a small value between cpos+len and start of the next record.
+ * This fake record has r_refcount = 0.
+ */
+static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, unsigned int len,
+ struct ocfs2_refcount_rec *ret_rec,
+ int *index,
+ struct buffer_head **ret_bh)
+{
+ int ret = 0, i, found;
+ u32 low_cpos, uninitialized_var(cpos_end);
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec = NULL;
+ struct ocfs2_extent_block *eb = NULL;
+ struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+ if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
+ ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
+ ret_rec, index);
+ *ret_bh = ref_root_bh;
+ get_bh(ref_root_bh);
+ return 0;
+ }
+
+ el = &rb->rf_list;
+ low_cpos = cpos & OCFS2_32BIT_POS_MASK;
+
+ if (el->l_tree_depth) {
+ ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
+ ocfs2_error(sb,
+ "refcount tree %llu has non zero tree "
+ "depth in leaf btree tree block %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
+ }
+ }
+
+ found = 0;
+ for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+ rec = &el->l_recs[i];
+
+ if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (found) {
+ ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
+ eb, el, i, &cpos_end);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (cpos_end < low_cpos + len)
+ len = cpos_end - low_cpos;
+ }
+
+ ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
+ &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
+ ret_rec, index);
+ *ret_bh = ref_leaf_bh;
+out:
+ brelse(eb_bh);
+ return ret;
+}
+
+enum ocfs2_ref_rec_contig {
+ REF_CONTIG_NONE = 0,
+ REF_CONTIG_LEFT,
+ REF_CONTIG_RIGHT,
+ REF_CONTIG_LEFTRIGHT,
+};
+
+static enum ocfs2_ref_rec_contig
+ ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
+ int index)
+{
+ if ((rb->rf_records.rl_recs[index].r_refcount ==
+ rb->rf_records.rl_recs[index + 1].r_refcount) &&
+ (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
+ le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
+ le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
+ return REF_CONTIG_RIGHT;
+
+ return REF_CONTIG_NONE;
+}
+
+static enum ocfs2_ref_rec_contig
+ ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
+ int index)
+{
+ enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
+
+ if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
+ ret = ocfs2_refcount_rec_adjacent(rb, index);
+
+ if (index > 0) {
+ enum ocfs2_ref_rec_contig tmp;
+
+ tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
+
+ if (tmp == REF_CONTIG_RIGHT) {
+ if (ret == REF_CONTIG_RIGHT)
+ ret = REF_CONTIG_LEFTRIGHT;
+ else
+ ret = REF_CONTIG_LEFT;
+ }
+ }
+
+ return ret;
+}
+
+static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
+ int index)
+{
+ BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
+ rb->rf_records.rl_recs[index+1].r_refcount);
+
+ le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
+ le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
+
+ if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
+ memmove(&rb->rf_records.rl_recs[index + 1],
+ &rb->rf_records.rl_recs[index + 2],
+ sizeof(struct ocfs2_refcount_rec) *
+ (le16_to_cpu(rb->rf_records.rl_used) - index - 2));
+
+ memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
+ 0, sizeof(struct ocfs2_refcount_rec));
+ le16_add_cpu(&rb->rf_records.rl_used, -1);
+}
+
+/*
+ * Merge the refcount rec if we are contiguous with the adjacent recs.
+ */
+static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
+ int index)
+{
+ enum ocfs2_ref_rec_contig contig =
+ ocfs2_refcount_rec_contig(rb, index);
+
+ if (contig == REF_CONTIG_NONE)
+ return;
+
+ if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
+ BUG_ON(index == 0);
+ index--;
+ }
+
+ ocfs2_rotate_refcount_rec_left(rb, index);
+
+ if (contig == REF_CONTIG_LEFTRIGHT)
+ ocfs2_rotate_refcount_rec_left(rb, index);
+}
+
+/*
+ * Change the refcount indexed by "index" in ref_bh.
+ * If refcount reaches 0, remove it.
+ */
+static int ocfs2_change_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_leaf_bh,
+ int index, int merge, int change)
+{
+ int ret;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_list *rl = &rb->rf_records;
+ struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_change_refcount_rec(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ index, le32_to_cpu(rec->r_refcount), change);
+ le32_add_cpu(&rec->r_refcount, change);
+
+ if (!rec->r_refcount) {
+ if (index != le16_to_cpu(rl->rl_used) - 1) {
+ memmove(rec, rec + 1,
+ (le16_to_cpu(rl->rl_used) - index - 1) *
+ sizeof(struct ocfs2_refcount_rec));
+ memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
+ 0, sizeof(struct ocfs2_refcount_rec));
+ }
+
+ le16_add_cpu(&rl->rl_used, -1);
+ } else if (merge)
+ ocfs2_refcount_rec_merge(rb, index);
+
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
+out:
+ return ret;
+}
+
+static int ocfs2_expand_inline_ref_root(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head **ref_leaf_bh,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret;
+ u16 suballoc_bit_start;
+ u32 num_got;
+ u64 suballoc_loc, blkno;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_refcount_block *new_rb;
+ struct ocfs2_refcount_block *root_rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+ &suballoc_bit_start, &num_got,
+ &blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ new_bh = sb_getblk(sb, blkno);
+ if (new_bh == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+ ret = ocfs2_journal_access_rb(handle, ci, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Initialize ocfs2_refcount_block.
+ * It should contain the same information as the old root.
+ * so just memcpy it and change the corresponding field.
+ */
+ memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
+
+ new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+ new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
+ new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ new_rb->rf_blkno = cpu_to_le64(blkno);
+ new_rb->rf_cpos = cpu_to_le32(0);
+ new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
+ new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
+ ocfs2_journal_dirty(handle, new_bh);
+
+ /* Now change the root. */
+ memset(&root_rb->rf_list, 0, sb->s_blocksize -
+ offsetof(struct ocfs2_refcount_block, rf_list));
+ root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
+ root_rb->rf_clusters = cpu_to_le32(1);
+ root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
+ root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+ root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+ root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
+
+ ocfs2_journal_dirty(handle, ref_root_bh);
+
+ trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
+ le16_to_cpu(new_rb->rf_records.rl_used));
+
+ *ref_leaf_bh = new_bh;
+ new_bh = NULL;
+out:
+ brelse(new_bh);
+ return ret;
+}
+
+static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
+ struct ocfs2_refcount_rec *next)
+{
+ if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
+ ocfs2_get_ref_rec_low_cpos(next))
+ return 1;
+
+ return 0;
+}
+
+static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
+{
+ const struct ocfs2_refcount_rec *l = a, *r = b;
+ u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
+ u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
+
+ if (l_cpos > r_cpos)
+ return 1;
+ if (l_cpos < r_cpos)
+ return -1;
+ return 0;
+}
+
+static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
+{
+ const struct ocfs2_refcount_rec *l = a, *r = b;
+ u64 l_cpos = le64_to_cpu(l->r_cpos);
+ u64 r_cpos = le64_to_cpu(r->r_cpos);
+
+ if (l_cpos > r_cpos)
+ return 1;
+ if (l_cpos < r_cpos)
+ return -1;
+ return 0;
+}
+
+static void swap_refcount_rec(void *a, void *b, int size)
+{
+ struct ocfs2_refcount_rec *l = a, *r = b, tmp;
+
+ tmp = *l;
+ *l = *r;
+ *r = tmp;
+}
+
+/*
+ * The refcount cpos are ordered by their 64bit cpos,
+ * But we will use the low 32 bit to be the e_cpos in the b-tree.
+ * So we need to make sure that this pos isn't intersected with others.
+ *
+ * Note: The refcount block is already sorted by their low 32 bit cpos,
+ * So just try the middle pos first, and we will exit when we find
+ * the good position.
+ */
+static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
+ u32 *split_pos, int *split_index)
+{
+ int num_used = le16_to_cpu(rl->rl_used);
+ int delta, middle = num_used / 2;
+
+ for (delta = 0; delta < middle; delta++) {
+ /* Let's check delta earlier than middle */
+ if (ocfs2_refcount_rec_no_intersect(
+ &rl->rl_recs[middle - delta - 1],
+ &rl->rl_recs[middle - delta])) {
+ *split_index = middle - delta;
+ break;
+ }
+
+ /* For even counts, don't walk off the end */
+ if ((middle + delta + 1) == num_used)
+ continue;
+
+ /* Now try delta past middle */
+ if (ocfs2_refcount_rec_no_intersect(
+ &rl->rl_recs[middle + delta],
+ &rl->rl_recs[middle + delta + 1])) {
+ *split_index = middle + delta + 1;
+ break;
+ }
+ }
+
+ if (delta >= middle)
+ return -ENOSPC;
+
+ *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
+ return 0;
+}
+
+static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
+ struct buffer_head *new_bh,
+ u32 *split_cpos)
+{
+ int split_index = 0, num_moved, ret;
+ u32 cpos = 0;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_list *rl = &rb->rf_records;
+ struct ocfs2_refcount_block *new_rb =
+ (struct ocfs2_refcount_block *)new_bh->b_data;
+ struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
+
+ trace_ocfs2_divide_leaf_refcount_block(
+ (unsigned long long)ref_leaf_bh->b_blocknr,
+ le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
+
+ /*
+ * XXX: Improvement later.
+ * If we know all the high 32 bit cpos is the same, no need to sort.
+ *
+ * In order to make the whole process safe, we do:
+ * 1. sort the entries by their low 32 bit cpos first so that we can
+ * find the split cpos easily.
+ * 2. call ocfs2_insert_extent to insert the new refcount block.
+ * 3. move the refcount rec to the new block.
+ * 4. sort the entries by their 64 bit cpos.
+ * 5. dirty the new_rb and rb.
+ */
+ sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
+ sizeof(struct ocfs2_refcount_rec),
+ cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
+
+ ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ new_rb->rf_cpos = cpu_to_le32(cpos);
+
+ /* move refcount records starting from split_index to the new block. */
+ num_moved = le16_to_cpu(rl->rl_used) - split_index;
+ memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
+ num_moved * sizeof(struct ocfs2_refcount_rec));
+
+ /*ok, remove the entries we just moved over to the other block. */
+ memset(&rl->rl_recs[split_index], 0,
+ num_moved * sizeof(struct ocfs2_refcount_rec));
+
+ /* change old and new rl_used accordingly. */
+ le16_add_cpu(&rl->rl_used, -num_moved);
+ new_rl->rl_used = cpu_to_le16(num_moved);
+
+ sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
+ sizeof(struct ocfs2_refcount_rec),
+ cmp_refcount_rec_by_cpos, swap_refcount_rec);
+
+ sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
+ sizeof(struct ocfs2_refcount_rec),
+ cmp_refcount_rec_by_cpos, swap_refcount_rec);
+
+ *split_cpos = cpos;
+ return 0;
+}
+
+static int ocfs2_new_leaf_refcount_block(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret;
+ u16 suballoc_bit_start;
+ u32 num_got, new_cpos;
+ u64 suballoc_loc, blkno;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct ocfs2_refcount_block *root_rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_refcount_block *new_rb;
+ struct ocfs2_extent_tree ref_et;
+
+ BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+ &suballoc_bit_start, &num_got,
+ &blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ new_bh = sb_getblk(sb, blkno);
+ if (new_bh == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+ ret = ocfs2_journal_access_rb(handle, ci, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Initialize ocfs2_refcount_block. */
+ new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+ memset(new_rb, 0, sb->s_blocksize);
+ strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+ new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
+ new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+ new_rb->rf_blkno = cpu_to_le64(blkno);
+ new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
+ new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
+ new_rb->rf_records.rl_count =
+ cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
+ new_rb->rf_generation = root_rb->rf_generation;
+
+ ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
+ ocfs2_journal_dirty(handle, new_bh);
+
+ ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
+
+ trace_ocfs2_new_leaf_refcount_block(
+ (unsigned long long)new_bh->b_blocknr, new_cpos);
+
+ /* Insert the new leaf block with the specific offset cpos. */
+ ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
+ 1, 0, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ brelse(new_bh);
+ return ret;
+}
+
+static int ocfs2_expand_refcount_tree(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret;
+ struct buffer_head *expand_bh = NULL;
+
+ if (ref_root_bh == ref_leaf_bh) {
+ /*
+ * the old root bh hasn't been expanded to a b-tree,
+ * so expand it first.
+ */
+ ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
+ &expand_bh, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else {
+ expand_bh = ref_leaf_bh;
+ get_bh(expand_bh);
+ }
+
+
+ /* Now add a new refcount block into the tree.*/
+ ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
+ expand_bh, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+out:
+ brelse(expand_bh);
+ return ret;
+}
+
+/*
+ * Adjust the extent rec in b-tree representing ref_leaf_bh.
+ *
+ * Only called when we have inserted a new refcount rec at index 0
+ * which means ocfs2_extent_rec.e_cpos may need some change.
+ */
+static int ocfs2_adjust_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_refcount_rec *rec)
+{
+ int ret = 0, i;
+ u32 new_cpos, old_cpos;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ struct ocfs2_extent_list *el;
+
+ if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
+ goto out;
+
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ old_cpos = le32_to_cpu(rb->rf_cpos);
+ new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
+ if (old_cpos <= new_cpos)
+ goto out;
+
+ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+
+ path = ocfs2_new_path_from_et(&et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(ci, path, old_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * 2 more credits, one for the leaf refcount block, one for
+ * the extent block contains the extent rec.
+ */
+ ret = ocfs2_extend_trans(handle, 2);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* change the leaf extent block first. */
+ el = path_leaf_el(path);
+
+ for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
+ if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
+ break;
+
+ BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
+
+ el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
+
+ /* change the r_cpos in the leaf block. */
+ rb->rf_cpos = cpu_to_le32(new_cpos);
+
+ ocfs2_journal_dirty(handle, path_leaf_bh(path));
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
+
+out:
+ ocfs2_free_path(path);
+ return ret;
+}
+
+static int ocfs2_insert_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_refcount_rec *rec,
+ int index, int merge,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_list *rf_list = &rb->rf_records;
+ struct buffer_head *new_bh = NULL;
+
+ BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
+
+ if (rf_list->rl_used == rf_list->rl_count) {
+ u64 cpos = le64_to_cpu(rec->r_cpos);
+ u32 len = le32_to_cpu(rec->r_clusters);
+
+ ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
+ ref_leaf_bh, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, len, NULL, &index,
+ &new_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ref_leaf_bh = new_bh;
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ rf_list = &rb->rf_records;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (index < le16_to_cpu(rf_list->rl_used))
+ memmove(&rf_list->rl_recs[index + 1],
+ &rf_list->rl_recs[index],
+ (le16_to_cpu(rf_list->rl_used) - index) *
+ sizeof(struct ocfs2_refcount_rec));
+
+ trace_ocfs2_insert_refcount_rec(
+ (unsigned long long)ref_leaf_bh->b_blocknr, index,
+ (unsigned long long)le64_to_cpu(rec->r_cpos),
+ le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
+
+ rf_list->rl_recs[index] = *rec;
+
+ le16_add_cpu(&rf_list->rl_used, 1);
+
+ if (merge)
+ ocfs2_refcount_rec_merge(rb, index);
+
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
+
+ if (index == 0) {
+ ret = ocfs2_adjust_refcount_rec(handle, ci,
+ ref_root_bh,
+ ref_leaf_bh, rec);
+ if (ret)
+ mlog_errno(ret);
+ }
+out:
+ brelse(new_bh);
+ return ret;
+}
+
+/*
+ * Split the refcount_rec indexed by "index" in ref_leaf_bh.
+ * This is much simple than our b-tree code.
+ * split_rec is the new refcount rec we want to insert.
+ * If split_rec->r_refcount > 0, we are changing the refcount(in case we
+ * increase refcount or decrease a refcount to non-zero).
+ * If split_rec->r_refcount == 0, we are punching a hole in current refcount
+ * rec( in case we decrease a refcount to zero).
+ */
+static int ocfs2_split_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_refcount_rec *split_rec,
+ int index, int merge,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret, recs_need;
+ u32 len;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_list *rf_list = &rb->rf_records;
+ struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
+ struct ocfs2_refcount_rec *tail_rec = NULL;
+ struct buffer_head *new_bh = NULL;
+
+ BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
+
+ trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
+ le32_to_cpu(orig_rec->r_clusters),
+ le32_to_cpu(orig_rec->r_refcount),
+ le64_to_cpu(split_rec->r_cpos),
+ le32_to_cpu(split_rec->r_clusters),
+ le32_to_cpu(split_rec->r_refcount));
+
+ /*
+ * If we just need to split the header or tail clusters,
+ * no more recs are needed, just split is OK.
+ * Otherwise we at least need one new recs.
+ */
+ if (!split_rec->r_refcount &&
+ (split_rec->r_cpos == orig_rec->r_cpos ||
+ le64_to_cpu(split_rec->r_cpos) +
+ le32_to_cpu(split_rec->r_clusters) ==
+ le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
+ recs_need = 0;
+ else
+ recs_need = 1;
+
+ /*
+ * We need one more rec if we split in the middle and the new rec have
+ * some refcount in it.
+ */
+ if (split_rec->r_refcount &&
+ (split_rec->r_cpos != orig_rec->r_cpos &&
+ le64_to_cpu(split_rec->r_cpos) +
+ le32_to_cpu(split_rec->r_clusters) !=
+ le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
+ recs_need++;
+
+ /* If the leaf block don't have enough record, expand it. */
+ if (le16_to_cpu(rf_list->rl_used) + recs_need >
+ le16_to_cpu(rf_list->rl_count)) {
+ struct ocfs2_refcount_rec tmp_rec;
+ u64 cpos = le64_to_cpu(orig_rec->r_cpos);
+ len = le32_to_cpu(orig_rec->r_clusters);
+ ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
+ ref_leaf_bh, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We have to re-get it since now cpos may be moved to
+ * another leaf block.
+ */
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, len, &tmp_rec, &index,
+ &new_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ref_leaf_bh = new_bh;
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ rf_list = &rb->rf_records;
+ orig_rec = &rf_list->rl_recs[index];
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We have calculated out how many new records we need and store
+ * in recs_need, so spare enough space first by moving the records
+ * after "index" to the end.
+ */
+ if (index != le16_to_cpu(rf_list->rl_used) - 1)
+ memmove(&rf_list->rl_recs[index + 1 + recs_need],
+ &rf_list->rl_recs[index + 1],
+ (le16_to_cpu(rf_list->rl_used) - index - 1) *
+ sizeof(struct ocfs2_refcount_rec));
+
+ len = (le64_to_cpu(orig_rec->r_cpos) +
+ le32_to_cpu(orig_rec->r_clusters)) -
+ (le64_to_cpu(split_rec->r_cpos) +
+ le32_to_cpu(split_rec->r_clusters));
+
+ /*
+ * If we have "len", the we will split in the tail and move it
+ * to the end of the space we have just spared.
+ */
+ if (len) {
+ tail_rec = &rf_list->rl_recs[index + recs_need];
+
+ memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
+ le64_add_cpu(&tail_rec->r_cpos,
+ le32_to_cpu(tail_rec->r_clusters) - len);
+ tail_rec->r_clusters = cpu_to_le32(len);
+ }
+
+ /*
+ * If the split pos isn't the same as the original one, we need to
+ * split in the head.
+ *
+ * Note: We have the chance that split_rec.r_refcount = 0,
+ * recs_need = 0 and len > 0, which means we just cut the head from
+ * the orig_rec and in that case we have done some modification in
+ * orig_rec above, so the check for r_cpos is faked.
+ */
+ if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
+ len = le64_to_cpu(split_rec->r_cpos) -
+ le64_to_cpu(orig_rec->r_cpos);
+ orig_rec->r_clusters = cpu_to_le32(len);
+ index++;
+ }
+
+ le16_add_cpu(&rf_list->rl_used, recs_need);
+
+ if (split_rec->r_refcount) {
+ rf_list->rl_recs[index] = *split_rec;
+ trace_ocfs2_split_refcount_rec_insert(
+ (unsigned long long)ref_leaf_bh->b_blocknr, index,
+ (unsigned long long)le64_to_cpu(split_rec->r_cpos),
+ le32_to_cpu(split_rec->r_clusters),
+ le32_to_cpu(split_rec->r_refcount));
+
+ if (merge)
+ ocfs2_refcount_rec_merge(rb, index);
+ }
+
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
+
+out:
+ brelse(new_bh);
+ return ret;
+}
+
+static int __ocfs2_increase_refcount(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, u32 len, int merge,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret = 0, index;
+ struct buffer_head *ref_leaf_bh = NULL;
+ struct ocfs2_refcount_rec rec;
+ unsigned int set_len = 0;
+
+ trace_ocfs2_increase_refcount_begin(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)cpos, len);
+
+ while (len) {
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, len, &rec, &index,
+ &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ set_len = le32_to_cpu(rec.r_clusters);
+
+ /*
+ * Here we may meet with 3 situations:
+ *
+ * 1. If we find an already existing record, and the length
+ * is the same, cool, we just need to increase the r_refcount
+ * and it is OK.
+ * 2. If we find a hole, just insert it with r_refcount = 1.
+ * 3. If we are in the middle of one extent record, split
+ * it.
+ */
+ if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
+ set_len <= len) {
+ trace_ocfs2_increase_refcount_change(
+ (unsigned long long)cpos, set_len,
+ le32_to_cpu(rec.r_refcount));
+ ret = ocfs2_change_refcount_rec(handle, ci,
+ ref_leaf_bh, index,
+ merge, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else if (!rec.r_refcount) {
+ rec.r_refcount = cpu_to_le32(1);
+
+ trace_ocfs2_increase_refcount_insert(
+ (unsigned long long)le64_to_cpu(rec.r_cpos),
+ set_len);
+ ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
+ ref_leaf_bh,
+ &rec, index,
+ merge, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else {
+ set_len = min((u64)(cpos + len),
+ le64_to_cpu(rec.r_cpos) + set_len) - cpos;
+ rec.r_cpos = cpu_to_le64(cpos);
+ rec.r_clusters = cpu_to_le32(set_len);
+ le32_add_cpu(&rec.r_refcount, 1);
+
+ trace_ocfs2_increase_refcount_split(
+ (unsigned long long)le64_to_cpu(rec.r_cpos),
+ set_len, le32_to_cpu(rec.r_refcount));
+ ret = ocfs2_split_refcount_rec(handle, ci,
+ ref_root_bh, ref_leaf_bh,
+ &rec, index, merge,
+ meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ cpos += set_len;
+ len -= set_len;
+ brelse(ref_leaf_bh);
+ ref_leaf_bh = NULL;
+ }
+
+out:
+ brelse(ref_leaf_bh);
+ return ret;
+}
+
+static int ocfs2_remove_refcount_extent(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_extent_tree et;
+
+ BUG_ON(rb->rf_records.rl_used);
+
+ trace_ocfs2_remove_refcount_extent(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)ref_leaf_bh->b_blocknr,
+ le32_to_cpu(rb->rf_cpos));
+
+ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+ ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
+ 1, meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_remove_from_cache(ci, ref_leaf_bh);
+
+ /*
+ * add the freed block to the dealloc so that it will be freed
+ * when we run dealloc.
+ */
+ ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
+ le16_to_cpu(rb->rf_suballoc_slot),
+ le64_to_cpu(rb->rf_suballoc_loc),
+ le64_to_cpu(rb->rf_blkno),
+ le16_to_cpu(rb->rf_suballoc_bit));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+ le32_add_cpu(&rb->rf_clusters, -1);
+
+ /*
+ * check whether we need to restore the root refcount block if
+ * there is no leaf extent block at atll.
+ */
+ if (!rb->rf_list.l_next_free_rec) {
+ BUG_ON(rb->rf_clusters);
+
+ trace_ocfs2_restore_refcount_block(
+ (unsigned long long)ref_root_bh->b_blocknr);
+
+ rb->rf_flags = 0;
+ rb->rf_parent = 0;
+ rb->rf_cpos = 0;
+ memset(&rb->rf_records, 0, sb->s_blocksize -
+ offsetof(struct ocfs2_refcount_block, rf_records));
+ rb->rf_records.rl_count =
+ cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
+ }
+
+ ocfs2_journal_dirty(handle, ref_root_bh);
+
+out:
+ return ret;
+}
+
+int ocfs2_increase_refcount(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
+ cpos, len, 1,
+ meta_ac, dealloc);
+}
+
+static int ocfs2_decrease_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ int index, u64 cpos, unsigned int len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
+
+ BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
+ BUG_ON(cpos + len >
+ le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
+
+ trace_ocfs2_decrease_refcount_rec(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)cpos, len);
+
+ if (cpos == le64_to_cpu(rec->r_cpos) &&
+ len == le32_to_cpu(rec->r_clusters))
+ ret = ocfs2_change_refcount_rec(handle, ci,
+ ref_leaf_bh, index, 1, -1);
+ else {
+ struct ocfs2_refcount_rec split = *rec;
+ split.r_cpos = cpu_to_le64(cpos);
+ split.r_clusters = cpu_to_le32(len);
+
+ le32_add_cpu(&split.r_refcount, -1);
+
+ ret = ocfs2_split_refcount_rec(handle, ci,
+ ref_root_bh, ref_leaf_bh,
+ &split, index, 1,
+ meta_ac, dealloc);
+ }
+
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Remove the leaf refcount block if it contains no refcount record. */
+ if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
+ ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
+ ref_leaf_bh, meta_ac,
+ dealloc);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out:
+ return ret;
+}
+
+static int __ocfs2_decrease_refcount(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int delete)
+{
+ int ret = 0, index = 0;
+ struct ocfs2_refcount_rec rec;
+ unsigned int r_count = 0, r_len;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct buffer_head *ref_leaf_bh = NULL;
+
+ trace_ocfs2_decrease_refcount(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)cpos, len, delete);
+
+ while (len) {
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, len, &rec, &index,
+ &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ r_count = le32_to_cpu(rec.r_refcount);
+ BUG_ON(r_count == 0);
+ if (!delete)
+ BUG_ON(r_count > 1);
+
+ r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
+ le32_to_cpu(rec.r_clusters)) - cpos;
+
+ ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
+ ref_leaf_bh, index,
+ cpos, r_len,
+ meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
+ ret = ocfs2_cache_cluster_dealloc(dealloc,
+ ocfs2_clusters_to_blocks(sb, cpos),
+ r_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ cpos += r_len;
+ len -= r_len;
+ brelse(ref_leaf_bh);
+ ref_leaf_bh = NULL;
+ }
+
+out:
+ brelse(ref_leaf_bh);
+ return ret;
+}
+
+/* Caller must hold refcount tree lock. */
+int ocfs2_decrease_refcount(struct inode *inode,
+ handle_t *handle, u32 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int delete)
+{
+ int ret;
+ u64 ref_blkno;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_tree *tree;
+
+ BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ ret = ocfs2_get_refcount_block(inode, &ref_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
+ &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
+ cpos, len, meta_ac, dealloc, delete);
+ if (ret)
+ mlog_errno(ret);
+out:
+ brelse(ref_root_bh);
+ return ret;
+}
+
+/*
+ * Mark the already-existing extent at cpos as refcounted for len clusters.
+ * This adds the refcount extent flag.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+static int ocfs2_mark_extent_refcounted(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ handle_t *handle, u32 cpos,
+ u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+
+ trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
+ cpos, len, phys);
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+ ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+ "tree, but the feature bit is not set in the "
+ "super block.", inode->i_ino);
+ ret = -EROFS;
+ goto out;
+ }
+
+ ret = ocfs2_change_extent_flag(handle, et, cpos,
+ len, phys, meta_ac, dealloc,
+ OCFS2_EXT_REFCOUNTED, 0);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+/*
+ * Given some contiguous physical clusters, calculate what we need
+ * for modifying their refcount.
+ */
+static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 start_cpos,
+ u32 clusters,
+ int *meta_add,
+ int *credits)
+{
+ int ret = 0, index, ref_blocks = 0, recs_add = 0;
+ u64 cpos = start_cpos;
+ struct ocfs2_refcount_block *rb;
+ struct ocfs2_refcount_rec rec;
+ struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
+ u32 len;
+
+ while (clusters) {
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, clusters, &rec,
+ &index, &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (ref_leaf_bh != prev_bh) {
+ /*
+ * Now we encounter a new leaf block, so calculate
+ * whether we need to extend the old leaf.
+ */
+ if (prev_bh) {
+ rb = (struct ocfs2_refcount_block *)
+ prev_bh->b_data;
+
+ if (le16_to_cpu(rb->rf_records.rl_used) +
+ recs_add >
+ le16_to_cpu(rb->rf_records.rl_count))
+ ref_blocks++;
+ }
+
+ recs_add = 0;
+ *credits += 1;
+ brelse(prev_bh);
+ prev_bh = ref_leaf_bh;
+ get_bh(prev_bh);
+ }
+
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+
+ trace_ocfs2_calc_refcount_meta_credits_iterate(
+ recs_add, (unsigned long long)cpos, clusters,
+ (unsigned long long)le64_to_cpu(rec.r_cpos),
+ le32_to_cpu(rec.r_clusters),
+ le32_to_cpu(rec.r_refcount), index);
+
+ len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
+ le32_to_cpu(rec.r_clusters)) - cpos;
+ /*
+ * We record all the records which will be inserted to the
+ * same refcount block, so that we can tell exactly whether
+ * we need a new refcount block or not.
+ *
+ * If we will insert a new one, this is easy and only happens
+ * during adding refcounted flag to the extent, so we don't
+ * have a chance of spliting. We just need one record.
+ *
+ * If the refcount rec already exists, that would be a little
+ * complicated. we may have to:
+ * 1) split at the beginning if the start pos isn't aligned.
+ * we need 1 more record in this case.
+ * 2) split int the end if the end pos isn't aligned.
+ * we need 1 more record in this case.
+ * 3) split in the middle because of file system fragmentation.
+ * we need 2 more records in this case(we can't detect this
+ * beforehand, so always think of the worst case).
+ */
+ if (rec.r_refcount) {
+ recs_add += 2;
+ /* Check whether we need a split at the beginning. */
+ if (cpos == start_cpos &&
+ cpos != le64_to_cpu(rec.r_cpos))
+ recs_add++;
+
+ /* Check whether we need a split in the end. */
+ if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
+ le32_to_cpu(rec.r_clusters))
+ recs_add++;
+ } else
+ recs_add++;
+
+ brelse(ref_leaf_bh);
+ ref_leaf_bh = NULL;
+ clusters -= len;
+ cpos += len;
+ }
+
+ if (prev_bh) {
+ rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
+
+ if (le16_to_cpu(rb->rf_records.rl_used) + recs_add >
+ le16_to_cpu(rb->rf_records.rl_count))
+ ref_blocks++;
+
+ *credits += 1;
+ }
+
+ if (!ref_blocks)
+ goto out;
+
+ *meta_add += ref_blocks;
+ *credits += ref_blocks;
+
+ /*
+ * So we may need ref_blocks to insert into the tree.
+ * That also means we need to change the b-tree and add that number
+ * of records since we never merge them.
+ * We need one more block for expansion since the new created leaf
+ * block is also full and needs split.
+ */
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+ *meta_add += ocfs2_extend_meta_needed(et.et_root_el);
+ *credits += ocfs2_calc_extend_credits(sb,
+ et.et_root_el);
+ } else {
+ *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+ *meta_add += 1;
+ }
+
+out:
+
+ trace_ocfs2_calc_refcount_meta_credits(
+ (unsigned long long)start_cpos, clusters,
+ *meta_add, *credits);
+ brelse(ref_leaf_bh);
+ brelse(prev_bh);
+ return ret;
+}
+
+/*
+ * For refcount tree, we will decrease some contiguous clusters
+ * refcount count, so just go through it to see how many blocks
+ * we gonna touch and whether we need to create new blocks.
+ *
+ * Normally the refcount blocks store these refcount should be
+ * contiguous also, so that we can get the number easily.
+ * We will at most add split 2 refcount records and 2 more
+ * refcount blocks, so just check it in a rough way.
+ *
+ * Caller must hold refcount tree lock.
+ */
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+ u64 refcount_loc,
+ u64 phys_blkno,
+ u32 clusters,
+ int *credits,
+ int *ref_blocks)
+{
+ int ret;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_tree *tree;
+ u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+ ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+ "tree, but the feature bit is not set in the "
+ "super block.", inode->i_ino);
+ ret = -EROFS;
+ goto out;
+ }
+
+ BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
+ refcount_loc, &tree);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
+ &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
+ &tree->rf_ci,
+ ref_root_bh,
+ start_cpos, clusters,
+ ref_blocks, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
+
+out:
+ brelse(ref_root_bh);
+ return ret;
+}
+
+#define MAX_CONTIG_BYTES 1048576
+
+static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
+{
+ return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
+}
+
+static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
+{
+ return ~(ocfs2_cow_contig_clusters(sb) - 1);
+}
+
+/*
+ * Given an extent that starts at 'start' and an I/O that starts at 'cpos',
+ * find an offset (start + (n * contig_clusters)) that is closest to cpos
+ * while still being less than or equal to it.
+ *
+ * The goal is to break the extent at a multiple of contig_clusters.
+ */
+static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
+ unsigned int start,
+ unsigned int cpos)
+{
+ BUG_ON(start > cpos);
+
+ return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
+}
+
+/*
+ * Given a cluster count of len, pad it out so that it is a multiple
+ * of contig_clusters.
+ */
+static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
+ unsigned int len)
+{
+ unsigned int padded =
+ (len + (ocfs2_cow_contig_clusters(sb) - 1)) &
+ ocfs2_cow_contig_mask(sb);
+
+ /* Did we wrap? */
+ if (padded < len)
+ padded = UINT_MAX;
+
+ return padded;
+}
+
+/*
+ * Calculate out the start and number of virtual clusters we need to to CoW.
+ *
+ * cpos is vitual start cluster position we want to do CoW in a
+ * file and write_len is the cluster length.
+ * max_cpos is the place where we want to stop CoW intentionally.
+ *
+ * Normal we will start CoW from the beginning of extent record cotaining cpos.
+ * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
+ * get good I/O from the resulting extent tree.
+ */
+static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
+ struct ocfs2_extent_list *el,
+ u32 cpos,
+ u32 write_len,
+ u32 max_cpos,
+ u32 *cow_start,
+ u32 *cow_len)
+{
+ int ret = 0;
+ int tree_height = le16_to_cpu(el->l_tree_depth), i;
+ struct buffer_head *eb_bh = NULL;
+ struct ocfs2_extent_block *eb = NULL;
+ struct ocfs2_extent_rec *rec;
+ unsigned int want_clusters, rec_end = 0;
+ int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
+ int leaf_clusters;
+
+ BUG_ON(cpos + write_len > max_cpos);
+
+ if (tree_height > 0) {
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in "
+ "leaf block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
+ }
+ }
+
+ *cow_len = 0;
+ for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
+
+ if (ocfs2_is_empty_extent(rec)) {
+ mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
+ "index %d\n", inode->i_ino, i);
+ continue;
+ }
+
+ if (le32_to_cpu(rec->e_cpos) +
+ le16_to_cpu(rec->e_leaf_clusters) <= cpos)
+ continue;
+
+ if (*cow_len == 0) {
+ /*
+ * We should find a refcounted record in the
+ * first pass.
+ */
+ BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
+ *cow_start = le32_to_cpu(rec->e_cpos);
+ }
+
+ /*
+ * If we encounter a hole, a non-refcounted record or
+ * pass the max_cpos, stop the search.
+ */
+ if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
+ (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
+ (max_cpos <= le32_to_cpu(rec->e_cpos)))
+ break;
+
+ leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
+ rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
+ if (rec_end > max_cpos) {
+ rec_end = max_cpos;
+ leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
+ }
+
+ /*
+ * How many clusters do we actually need from
+ * this extent? First we see how many we actually
+ * need to complete the write. If that's smaller
+ * than contig_clusters, we try for contig_clusters.
+ */
+ if (!*cow_len)
+ want_clusters = write_len;
+ else
+ want_clusters = (cpos + write_len) -
+ (*cow_start + *cow_len);
+ if (want_clusters < contig_clusters)
+ want_clusters = contig_clusters;
+
+ /*
+ * If the write does not cover the whole extent, we
+ * need to calculate how we're going to split the extent.
+ * We try to do it on contig_clusters boundaries.
+ *
+ * Any extent smaller than contig_clusters will be
+ * CoWed in its entirety.
+ */
+ if (leaf_clusters <= contig_clusters)
+ *cow_len += leaf_clusters;
+ else if (*cow_len || (*cow_start == cpos)) {
+ /*
+ * This extent needs to be CoW'd from its
+ * beginning, so all we have to do is compute
+ * how many clusters to grab. We align
+ * want_clusters to the edge of contig_clusters
+ * to get better I/O.
+ */
+ want_clusters = ocfs2_cow_align_length(inode->i_sb,
+ want_clusters);
+
+ if (leaf_clusters < want_clusters)
+ *cow_len += leaf_clusters;
+ else
+ *cow_len += want_clusters;
+ } else if ((*cow_start + contig_clusters) >=
+ (cpos + write_len)) {
+ /*
+ * Breaking off contig_clusters at the front
+ * of the extent will cover our write. That's
+ * easy.
+ */
+ *cow_len = contig_clusters;
+ } else if ((rec_end - cpos) <= contig_clusters) {
+ /*
+ * Breaking off contig_clusters at the tail of
+ * this extent will cover cpos.
+ */
+ *cow_start = rec_end - contig_clusters;
+ *cow_len = contig_clusters;
+ } else if ((rec_end - cpos) <= want_clusters) {
+ /*
+ * While we can't fit the entire write in this
+ * extent, we know that the write goes from cpos
+ * to the end of the extent. Break that off.
+ * We try to break it at some multiple of
+ * contig_clusters from the front of the extent.
+ * Failing that (ie, cpos is within
+ * contig_clusters of the front), we'll CoW the
+ * entire extent.
+ */
+ *cow_start = ocfs2_cow_align_start(inode->i_sb,
+ *cow_start, cpos);
+ *cow_len = rec_end - *cow_start;
+ } else {
+ /*
+ * Ok, the entire write lives in the middle of
+ * this extent. Let's try to slice the extent up
+ * nicely. Optimally, our CoW region starts at
+ * m*contig_clusters from the beginning of the
+ * extent and goes for n*contig_clusters,
+ * covering the entire write.
+ */
+ *cow_start = ocfs2_cow_align_start(inode->i_sb,
+ *cow_start, cpos);
+
+ want_clusters = (cpos + write_len) - *cow_start;
+ want_clusters = ocfs2_cow_align_length(inode->i_sb,
+ want_clusters);
+ if (*cow_start + want_clusters <= rec_end)
+ *cow_len = want_clusters;
+ else
+ *cow_len = rec_end - *cow_start;
+ }
+
+ /* Have we covered our entire write yet? */
+ if ((*cow_start + *cow_len) >= (cpos + write_len))
+ break;
+
+ /*
+ * If we reach the end of the extent block and don't get enough
+ * clusters, continue with the next extent block if possible.
+ */
+ if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
+ eb && eb->h_next_leaf_blk) {
+ brelse(eb_bh);
+ eb_bh = NULL;
+
+ ret = ocfs2_read_extent_block(INODE_CACHE(inode),
+ le64_to_cpu(eb->h_next_leaf_blk),
+ &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+ i = -1;
+ }
+ }
+
+out:
+ brelse(eb_bh);
+ return ret;
+}
+
+/*
+ * Prepare meta_ac, data_ac and calculate credits when we want to add some
+ * num_clusters in data_tree "et" and change the refcount for the old
+ * clusters(starting form p_cluster) in the refcount tree.
+ *
+ * Note:
+ * 1. since we may split the old tree, so we at most will need num_clusters + 2
+ * more new leaf records.
+ * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so
+ * just give data_ac = NULL.
+ */
+static int ocfs2_lock_refcount_allocators(struct super_block *sb,
+ u32 p_cluster, u32 num_clusters,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_alloc_context **meta_ac,
+ struct ocfs2_alloc_context **data_ac,
+ int *credits)
+{
+ int ret = 0, meta_add = 0;
+ int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
+
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (num_free_extents < num_clusters + 2)
+ meta_add =
+ ocfs2_extend_meta_needed(et->et_root_el);
+
+ *credits += ocfs2_calc_extend_credits(sb, et->et_root_el);
+
+ ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
+ p_cluster, num_clusters,
+ &meta_add, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
+ ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
+ meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (data_ac) {
+ ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
+ data_ac);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out:
+ if (ret) {
+ if (*meta_ac) {
+ ocfs2_free_alloc_context(*meta_ac);
+ *meta_ac = NULL;
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
+{
+ BUG_ON(buffer_dirty(bh));
+
+ clear_buffer_mapped(bh);
+
+ return 0;
+}
+
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len)
+{
+ int ret = 0, partial;
+ struct super_block *sb = inode->i_sb;
+ u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
+ struct page *page;
+ pgoff_t page_index;
+ unsigned int from, to, readahead_pages;
+ loff_t offset, end, map_end;
+ struct address_space *mapping = inode->i_mapping;
+
+ trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
+ new_cluster, new_len);
+
+ readahead_pages =
+ (ocfs2_cow_contig_clusters(sb) <<
+ OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
+ offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+ end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
+ /*
+ * We only duplicate pages until we reach the page contains i_size - 1.
+ * So trim 'end' to i_size.
+ */
+ if (end > i_size_read(inode))
+ end = i_size_read(inode);
+
+ while (offset < end) {
+ page_index = offset >> PAGE_CACHE_SHIFT;
+ map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+ if (map_end > end)
+ map_end = end;
+
+ /* from, to is the offset within the page. */
+ from = offset & (PAGE_CACHE_SIZE - 1);
+ to = PAGE_CACHE_SIZE;
+ if (map_end & (PAGE_CACHE_SIZE - 1))
+ to = map_end & (PAGE_CACHE_SIZE - 1);
+
+ page = find_or_create_page(mapping, page_index, GFP_NOFS);
+ if (!page) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ break;
+ }
+
+ /*
+ * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
+ * can't be dirtied before we CoW it out.
+ */
+ if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
+ BUG_ON(PageDirty(page));
+
+ if (!PageUptodate(page)) {
+ ret = block_read_full_page(page, ocfs2_get_block);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ lock_page(page);
+ }
+
+ if (page_has_buffers(page)) {
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, &partial,
+ ocfs2_clear_cow_buffer);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ }
+
+ ocfs2_map_and_dirty_page(inode,
+ handle, from, to,
+ page, 0, &new_block);
+ mark_page_accessed(page);
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ page = NULL;
+ offset = map_end;
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len)
+{
+ int ret = 0;
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_caching_info *ci = INODE_CACHE(inode);
+ int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
+ u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
+ u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ struct buffer_head *old_bh = NULL;
+ struct buffer_head *new_bh = NULL;
+
+ trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
+ new_cluster, new_len);
+
+ for (i = 0; i < blocks; i++, old_block++, new_block++) {
+ new_bh = sb_getblk(osb->sb, new_block);
+ if (new_bh == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ break;
+ }
+
+ ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+ ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_journal_access(handle, ci, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
+ ocfs2_journal_dirty(handle, new_bh);
+
+ brelse(new_bh);
+ brelse(old_bh);
+ new_bh = NULL;
+ old_bh = NULL;
+ }
+
+ brelse(new_bh);
+ brelse(old_bh);
+ return ret;
+}
+
+static int ocfs2_clear_ext_refcount(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 p_cluster, u32 len,
+ unsigned int ext_flags,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret, index;
+ struct ocfs2_extent_rec replace_rec;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_list *el;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+ u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
+
+ trace_ocfs2_clear_ext_refcount((unsigned long long)ino,
+ cpos, len, p_cluster, ext_flags);
+
+ memset(&replace_rec, 0, sizeof(replace_rec));
+ replace_rec.e_cpos = cpu_to_le32(cpos);
+ replace_rec.e_leaf_clusters = cpu_to_le16(len);
+ replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
+ p_cluster));
+ replace_rec.e_flags = ext_flags;
+ replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
+
+ path = ocfs2_new_path_from_et(et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(et->et_ci, path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ ocfs2_error(sb,
+ "Inode %llu has an extent at cpos %u which can no "
+ "longer be found.\n",
+ (unsigned long long)ino, cpos);
+ ret = -EROFS;
+ goto out;
+ }
+
+ ret = ocfs2_split_extent(handle, et, path, index,
+ &replace_rec, meta_ac, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ ocfs2_free_path(path);
+ return ret;
+}
+
+static int ocfs2_replace_clusters(handle_t *handle,
+ struct ocfs2_cow_context *context,
+ u32 cpos, u32 old,
+ u32 new, u32 len,
+ unsigned int ext_flags)
+{
+ int ret;
+ struct ocfs2_caching_info *ci = context->data_et.et_ci;
+ u64 ino = ocfs2_metadata_cache_owner(ci);
+
+ trace_ocfs2_replace_clusters((unsigned long long)ino,
+ cpos, old, new, len, ext_flags);
+
+ /*If the old clusters is unwritten, no need to duplicate. */
+ if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+ ret = context->cow_duplicate_clusters(handle, context->inode,
+ cpos, old, new, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
+ cpos, new, len, ext_flags,
+ context->meta_ac, &context->dealloc);
+ if (ret)
+ mlog_errno(ret);
+out:
+ return ret;
+}
+
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+ struct inode *inode,
+ u32 cpos, u32 num_clusters)
+{
+ int ret = 0;
+ loff_t offset, end, map_end;
+ pgoff_t page_index;
+ struct page *page;
+
+ if (ocfs2_should_order_data(inode))
+ return 0;
+
+ offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+ end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
+
+ ret = filemap_fdatawrite_range(inode->i_mapping,
+ offset, end - 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ while (offset < end) {
+ page_index = offset >> PAGE_CACHE_SHIFT;
+ map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+ if (map_end > end)
+ map_end = end;
+
+ page = find_or_create_page(inode->i_mapping,
+ page_index, GFP_NOFS);
+ BUG_ON(!page);
+
+ wait_on_page_writeback(page);
+ if (PageError(page)) {
+ ret = -EIO;
+ mlog_errno(ret);
+ } else
+ mark_page_accessed(page);
+
+ unlock_page(page);
+ page_cache_release(page);
+ page = NULL;
+ offset = map_end;
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
+ u32 v_cluster, u32 *p_cluster,
+ u32 *num_clusters,
+ unsigned int *extent_flags)
+{
+ return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
+ num_clusters, extent_flags);
+}
+
+static int ocfs2_make_clusters_writable(struct super_block *sb,
+ struct ocfs2_cow_context *context,
+ u32 cpos, u32 p_cluster,
+ u32 num_clusters, unsigned int e_flags)
+{
+ int ret, delete, index, credits = 0;
+ u32 new_bit, new_len, orig_num_clusters;
+ unsigned int set_len;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ handle_t *handle;
+ struct buffer_head *ref_leaf_bh = NULL;
+ struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
+ struct ocfs2_refcount_rec rec;
+
+ trace_ocfs2_make_clusters_writable(cpos, p_cluster,
+ num_clusters, e_flags);
+
+ ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
+ &context->data_et,
+ ref_ci,
+ context->ref_root_bh,
+ &context->meta_ac,
+ &context->data_ac, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ if (context->post_refcount)
+ credits += context->post_refcount->credits;
+
+ credits += context->extra_credits;
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ orig_num_clusters = num_clusters;
+
+ while (num_clusters) {
+ ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
+ p_cluster, num_clusters,
+ &rec, &index, &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ BUG_ON(!rec.r_refcount);
+ set_len = min((u64)p_cluster + num_clusters,
+ le64_to_cpu(rec.r_cpos) +
+ le32_to_cpu(rec.r_clusters)) - p_cluster;
+
+ /*
+ * There are many different situation here.
+ * 1. If refcount == 1, remove the flag and don't COW.
+ * 2. If refcount > 1, allocate clusters.
+ * Here we may not allocate r_len once at a time, so continue
+ * until we reach num_clusters.
+ */
+ if (le32_to_cpu(rec.r_refcount) == 1) {
+ delete = 0;
+ ret = ocfs2_clear_ext_refcount(handle,
+ &context->data_et,
+ cpos, p_cluster,
+ set_len, e_flags,
+ context->meta_ac,
+ &context->dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ } else {
+ delete = 1;
+
+ ret = __ocfs2_claim_clusters(handle,
+ context->data_ac,
+ 1, set_len,
+ &new_bit, &new_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_replace_clusters(handle, context,
+ cpos, p_cluster, new_bit,
+ new_len, e_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ set_len = new_len;
+ }
+
+ ret = __ocfs2_decrease_refcount(handle, ref_ci,
+ context->ref_root_bh,
+ p_cluster, set_len,
+ context->meta_ac,
+ &context->dealloc, delete);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ cpos += set_len;
+ p_cluster += set_len;
+ num_clusters -= set_len;
+ brelse(ref_leaf_bh);
+ ref_leaf_bh = NULL;
+ }
+
+ /* handle any post_cow action. */
+ if (context->post_refcount && context->post_refcount->func) {
+ ret = context->post_refcount->func(context->inode, handle,
+ context->post_refcount->para);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ }
+
+ /*
+ * Here we should write the new page out first if we are
+ * in write-back mode.
+ */
+ if (context->get_clusters == ocfs2_di_get_clusters) {
+ ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
+ orig_num_clusters);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ if (context->data_ac) {
+ ocfs2_free_alloc_context(context->data_ac);
+ context->data_ac = NULL;
+ }
+ if (context->meta_ac) {
+ ocfs2_free_alloc_context(context->meta_ac);
+ context->meta_ac = NULL;
+ }
+ brelse(ref_leaf_bh);
+
+ return ret;
+}
+
+static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
+{
+ int ret = 0;
+ struct inode *inode = context->inode;
+ u32 cow_start = context->cow_start, cow_len = context->cow_len;
+ u32 p_cluster, num_clusters;
+ unsigned int ext_flags;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+ ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+ "tree, but the feature bit is not set in the "
+ "super block.", inode->i_ino);
+ return -EROFS;
+ }
+
+ ocfs2_init_dealloc_ctxt(&context->dealloc);
+
+ while (cow_len) {
+ ret = context->get_clusters(context, cow_start, &p_cluster,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
+
+ if (cow_len < num_clusters)
+ num_clusters = cow_len;
+
+ ret = ocfs2_make_clusters_writable(inode->i_sb, context,
+ cow_start, p_cluster,
+ num_clusters, ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ cow_len -= num_clusters;
+ cow_start += num_clusters;
+ }
+
+ if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &context->dealloc);
+ }
+
+ return ret;
+}
+
+/*
+ * Starting at cpos, try to CoW write_len clusters. Don't CoW
+ * past max_cpos. This will stop when it runs into a hole or an
+ * unrefcounted extent.
+ */
+static int ocfs2_refcount_cow_hunk(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 cpos, u32 write_len, u32 max_cpos)
+{
+ int ret;
+ u32 cow_start = 0, cow_len = 0;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct ocfs2_cow_context *context = NULL;
+
+ BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
+ cpos, write_len, max_cpos,
+ &cow_start, &cow_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
+ cpos, write_len, max_cpos,
+ cow_start, cow_len);
+
+ BUG_ON(cow_len == 0);
+
+ context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
+ if (!context) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ context->inode = inode;
+ context->cow_start = cow_start;
+ context->cow_len = cow_len;
+ context->ref_tree = ref_tree;
+ context->ref_root_bh = ref_root_bh;
+ context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
+ context->get_clusters = ocfs2_di_get_clusters;
+
+ ocfs2_init_dinode_extent_tree(&context->data_et,
+ INODE_CACHE(inode), di_bh);
+
+ ret = ocfs2_replace_cow(context);
+ if (ret)
+ mlog_errno(ret);
+
+ /*
+ * truncate the extent map here since no matter whether we meet with
+ * any error during the action, we shouldn't trust cached extent map
+ * any more.
+ */
+ ocfs2_extent_map_trunc(inode, cow_start);
+
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+out:
+ kfree(context);
+ return ret;
+}
+
+/*
+ * CoW any and all clusters between cpos and cpos+write_len.
+ * Don't CoW past max_cpos. If this returns successfully, all
+ * clusters between cpos and cpos+write_len are safe to modify.
+ */
+int ocfs2_refcount_cow(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 cpos, u32 write_len, u32 max_cpos)
+{
+ int ret = 0;
+ u32 p_cluster, num_clusters;
+ unsigned int ext_flags;
+
+ while (write_len) {
+ ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ if (write_len < num_clusters)
+ num_clusters = write_len;
+
+ if (ext_flags & OCFS2_EXT_REFCOUNTED) {
+ ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
+ num_clusters, max_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ write_len -= num_clusters;
+ cpos += num_clusters;
+ }
+
+ return ret;
+}
+
+static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
+ u32 v_cluster, u32 *p_cluster,
+ u32 *num_clusters,
+ unsigned int *extent_flags)
+{
+ struct inode *inode = context->inode;
+ struct ocfs2_xattr_value_root *xv = context->cow_object;
+
+ return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
+ num_clusters, &xv->xr_list,
+ extent_flags);
+}
+
+/*
+ * Given a xattr value root, calculate the most meta/credits we need for
+ * refcount tree change if we truncate it to 0.
+ */
+int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_xattr_value_root *xv,
+ int *meta_add, int *credits)
+{
+ int ret = 0, index, ref_blocks = 0;
+ u32 p_cluster, num_clusters;
+ u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
+ struct ocfs2_refcount_block *rb;
+ struct ocfs2_refcount_rec rec;
+ struct buffer_head *ref_leaf_bh = NULL;
+
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, &xv->xr_list,
+ NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cpos += num_clusters;
+
+ while (num_clusters) {
+ ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
+ p_cluster, num_clusters,
+ &rec, &index,
+ &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(!rec.r_refcount);
+
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+
+ /*
+ * We really don't know whether the other clusters is in
+ * this refcount block or not, so just take the worst
+ * case that all the clusters are in this block and each
+ * one will split a refcount rec, so totally we need
+ * clusters * 2 new refcount rec.
+ */
+ if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
+ le16_to_cpu(rb->rf_records.rl_count))
+ ref_blocks++;
+
+ *credits += 1;
+ brelse(ref_leaf_bh);
+ ref_leaf_bh = NULL;
+
+ if (num_clusters <= le32_to_cpu(rec.r_clusters))
+ break;
+ else
+ num_clusters -= le32_to_cpu(rec.r_clusters);
+ p_cluster += num_clusters;
+ }
+ }
+
+ *meta_add += ref_blocks;
+ if (!ref_blocks)
+ goto out;
+
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+ *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+ else {
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
+ *credits += ocfs2_calc_extend_credits(inode->i_sb,
+ et.et_root_el);
+ }
+
+out:
+ brelse(ref_leaf_bh);
+ return ret;
+}
+
+/*
+ * Do CoW for xattr.
+ */
+int ocfs2_refcount_cow_xattr(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_refcount_tree *ref_tree,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 write_len,
+ struct ocfs2_post_refcount *post)
+{
+ int ret;
+ struct ocfs2_xattr_value_root *xv = vb->vb_xv;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_cow_context *context = NULL;
+ u32 cow_start, cow_len;
+
+ BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
+ cpos, write_len, UINT_MAX,
+ &cow_start, &cow_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(cow_len == 0);
+
+ context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
+ if (!context) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ context->inode = inode;
+ context->cow_start = cow_start;
+ context->cow_len = cow_len;
+ context->ref_tree = ref_tree;
+ context->ref_root_bh = ref_root_bh;
+ context->cow_object = xv;
+
+ context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
+ /* We need the extra credits for duplicate_clusters by jbd. */
+ context->extra_credits =
+ ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
+ context->get_clusters = ocfs2_xattr_value_get_clusters;
+ context->post_refcount = post;
+
+ ocfs2_init_xattr_value_extent_tree(&context->data_et,
+ INODE_CACHE(inode), vb);
+
+ ret = ocfs2_replace_cow(context);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ kfree(context);
+ return ret;
+}
+
+/*
+ * Insert a new extent into refcount tree and mark a extent rec
+ * as refcounted in the dinode tree.
+ */
+int ocfs2_add_refcount_flag(struct inode *inode,
+ struct ocfs2_extent_tree *data_et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 p_cluster, u32 num_clusters,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_post_refcount *post)
+{
+ int ret;
+ handle_t *handle;
+ int credits = 1, ref_blocks = 0;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_alloc_context *meta_ac = NULL;
+
+ ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
+ ref_ci, ref_root_bh,
+ p_cluster, num_clusters,
+ &ref_blocks, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_add_refcount_flag(ref_blocks, credits);
+
+ if (ref_blocks) {
+ ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+ ref_blocks, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (post)
+ credits += post->credits;
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
+ cpos, num_clusters, p_cluster,
+ meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+ p_cluster, num_clusters, 0,
+ meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ if (post && post->func) {
+ ret = post->func(inode, handle, post->para);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+static int ocfs2_change_ctime(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+ int ret;
+ handle_t *handle;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ inode->i_ctime = CURRENT_TIME;
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+ return ret;
+}
+
+static int ocfs2_attach_refcount_tree(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+ int ret, data_changed = 0;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_refcount_tree *ref_tree;
+ unsigned int ext_flags;
+ loff_t size;
+ u32 cpos, num_clusters, clusters, p_cluster;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_extent_tree di_et;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
+ ret = ocfs2_create_refcount_tree(inode, di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ BUG_ON(!di->i_refcount_loc);
+ ret = ocfs2_lock_refcount_tree(osb,
+ le64_to_cpu(di->i_refcount_loc), 1,
+ &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ goto attach_xattr;
+
+ ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
+
+ size = i_size_read(inode);
+ clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
+
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
+ ret = ocfs2_add_refcount_flag(inode, &di_et,
+ &ref_tree->rf_ci,
+ ref_root_bh, cpos,
+ p_cluster, num_clusters,
+ &dealloc, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ data_changed = 1;
+ }
+ cpos += num_clusters;
+ }
+
+attach_xattr:
+ if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+ ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
+ &ref_tree->rf_ci,
+ ref_root_bh,
+ &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ }
+
+ if (data_changed) {
+ ret = ocfs2_change_ctime(inode, di_bh);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+unlock:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+
+ if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+ }
+out:
+ /*
+ * Empty the extent map so that we may get the right extent
+ * record from the disk.
+ */
+ ocfs2_extent_map_trunc(inode, 0);
+
+ return ret;
+}
+
+static int ocfs2_add_refcounted_extent(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 p_cluster, u32 num_clusters,
+ unsigned int ext_flags,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+ handle_t *handle;
+ int credits = 0;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_alloc_context *meta_ac = NULL;
+
+ ret = ocfs2_lock_refcount_allocators(inode->i_sb,
+ p_cluster, num_clusters,
+ et, ref_ci,
+ ref_root_bh, &meta_ac,
+ NULL, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_insert_extent(handle, et, cpos,
+ ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
+ num_clusters, ext_flags, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+ p_cluster, num_clusters,
+ meta_ac, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+static int ocfs2_duplicate_inline_data(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh)
+{
+ int ret;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
+ struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
+ struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
+
+ BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
+ memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
+ le16_to_cpu(s_di->id2.i_data.id_count));
+ spin_lock(&OCFS2_I(t_inode)->ip_lock);
+ OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
+ t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
+ spin_unlock(&OCFS2_I(t_inode)->ip_lock);
+
+ ocfs2_journal_dirty(handle, t_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ return ret;
+}
+
+static int ocfs2_duplicate_extent_list(struct inode *s_inode,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret = 0;
+ u32 p_cluster, num_clusters, clusters, cpos;
+ loff_t size;
+ unsigned int ext_flags;
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
+
+ size = i_size_read(s_inode);
+ clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
+
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ if (p_cluster) {
+ ret = ocfs2_add_refcounted_extent(t_inode, &et,
+ ref_ci, ref_root_bh,
+ cpos, p_cluster,
+ num_clusters,
+ ext_flags,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ cpos += num_clusters;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * change the new file's attributes to the src.
+ *
+ * reflink creates a snapshot of a file, that means the attributes
+ * must be identical except for three exceptions - nlink, ino, and ctime.
+ */
+static int ocfs2_complete_reflink(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ bool preserve)
+{
+ int ret;
+ handle_t *handle;
+ struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
+ loff_t size = i_size_read(s_inode);
+
+ handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ spin_lock(&OCFS2_I(t_inode)->ip_lock);
+ OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
+ OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
+ OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
+ spin_unlock(&OCFS2_I(t_inode)->ip_lock);
+ i_size_write(t_inode, size);
+ t_inode->i_blocks = s_inode->i_blocks;
+
+ di->i_xattr_inline_size = s_di->i_xattr_inline_size;
+ di->i_clusters = s_di->i_clusters;
+ di->i_size = s_di->i_size;
+ di->i_dyn_features = s_di->i_dyn_features;
+ di->i_attr = s_di->i_attr;
+
+ if (preserve) {
+ t_inode->i_uid = s_inode->i_uid;
+ t_inode->i_gid = s_inode->i_gid;
+ t_inode->i_mode = s_inode->i_mode;
+ di->i_uid = s_di->i_uid;
+ di->i_gid = s_di->i_gid;
+ di->i_mode = s_di->i_mode;
+
+ /*
+ * update time.
+ * we want mtime to appear identical to the source and
+ * update ctime.
+ */
+ t_inode->i_ctime = CURRENT_TIME;
+
+ di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
+
+ t_inode->i_mtime = s_inode->i_mtime;
+ di->i_mtime = s_di->i_mtime;
+ di->i_mtime_nsec = s_di->i_mtime_nsec;
+ }
+
+ ocfs2_journal_dirty(handle, t_bh);
+
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
+ return ret;
+}
+
+static int ocfs2_create_reflink_node(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ bool preserve)
+{
+ int ret;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
+ struct ocfs2_refcount_block *rb;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
+ struct ocfs2_refcount_tree *ref_tree;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+ le64_to_cpu(di->i_refcount_loc));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
+ t_inode, t_bh);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+ ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
+ &ref_tree->rf_ci, ref_root_bh,
+ &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_refcount;
+ }
+
+out_unlock_refcount:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+out:
+ if (ocfs2_dealloc_has_cluster(&dealloc)) {
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+ }
+
+ return ret;
+}
+
+static int __ocfs2_reflink(struct dentry *old_dentry,
+ struct buffer_head *old_bh,
+ struct inode *new_inode,
+ bool preserve)
+{
+ int ret;
+ struct inode *inode = old_dentry->d_inode;
+ struct buffer_head *new_bh = NULL;
+
+ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = filemap_fdatawrite(inode->i_mapping);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_attach_refcount_tree(inode, old_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
+ ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
+ OI_LS_REFLINK_TARGET);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_create_reflink_node(inode, old_bh,
+ new_inode, new_bh, preserve);
+ if (ret) {
+ mlog_errno(ret);
+ goto inode_unlock;
+ }
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+ ret = ocfs2_reflink_xattrs(inode, old_bh,
+ new_inode, new_bh,
+ preserve);
+ if (ret) {
+ mlog_errno(ret);
+ goto inode_unlock;
+ }
+ }
+
+ ret = ocfs2_complete_reflink(inode, old_bh,
+ new_inode, new_bh, preserve);
+ if (ret)
+ mlog_errno(ret);
+
+inode_unlock:
+ ocfs2_inode_unlock(new_inode, 1);
+ brelse(new_bh);
+out_unlock:
+ mutex_unlock(&new_inode->i_mutex);
+out:
+ if (!ret) {
+ ret = filemap_fdatawait(inode->i_mapping);
+ if (ret)
+ mlog_errno(ret);
+ }
+ return ret;
+}
+
+static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry, bool preserve)
+{
+ int error;
+ struct inode *inode = old_dentry->d_inode;
+ struct buffer_head *old_bh = NULL;
+ struct inode *new_orphan_inode = NULL;
+ struct posix_acl *default_acl, *acl;
+ umode_t mode;
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ mode = inode->i_mode;
+ error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ error = ocfs2_create_inode_in_orphan(dir, mode,
+ &new_orphan_inode);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ error = ocfs2_rw_lock(inode, 1);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ error = ocfs2_inode_lock(inode, &old_bh, 1);
+ if (error) {
+ mlog_errno(error);
+ ocfs2_rw_unlock(inode, 1);
+ goto out;
+ }
+
+ down_write(&OCFS2_I(inode)->ip_xattr_sem);
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ error = __ocfs2_reflink(old_dentry, old_bh,
+ new_orphan_inode, preserve);
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ up_write(&OCFS2_I(inode)->ip_xattr_sem);
+
+ ocfs2_inode_unlock(inode, 1);
+ ocfs2_rw_unlock(inode, 1);
+ brelse(old_bh);
+
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ /* If the security isn't preserved, we need to re-initialize them. */
+ if (!preserve) {
+ error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
+ &new_dentry->d_name,
+ default_acl, acl);
+ if (error)
+ mlog_errno(error);
+ }
+out:
+ if (default_acl)
+ posix_acl_release(default_acl);
+ if (acl)
+ posix_acl_release(acl);
+ if (!error) {
+ error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+ new_dentry);
+ if (error)
+ mlog_errno(error);
+ }
+
+ if (new_orphan_inode) {
+ /*
+ * We need to open_unlock the inode no matter whether we
+ * succeed or not, so that other nodes can delete it later.
+ */
+ ocfs2_open_unlock(new_orphan_inode);
+ if (error)
+ iput(new_orphan_inode);
+ }
+
+ return error;
+}
+
+/*
+ * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
+ * sys_reflink(). This will go away when vfs_reflink() exists in
+ * fs/namei.c.
+ */
+
+/* copied from may_create in VFS. */
+static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
+{
+ if (child->d_inode)
+ return -EEXIST;
+ if (IS_DEADDIR(dir))
+ return -ENOENT;
+ return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/**
+ * ocfs2_vfs_reflink - Create a reference-counted link
+ *
+ * @old_dentry: source dentry + inode
+ * @dir: directory to create the target
+ * @new_dentry: target dentry
+ * @preserve: if true, preserve all file attributes
+ */
+static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry, bool preserve)
+{
+ struct inode *inode = old_dentry->d_inode;
+ int error;
+
+ if (!inode)
+ return -ENOENT;
+
+ error = ocfs2_may_create(dir, new_dentry);
+ if (error)
+ return error;
+
+ if (dir->i_sb != inode->i_sb)
+ return -EXDEV;
+
+ /*
+ * A reflink to an append-only or immutable file cannot be created.
+ */
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return -EPERM;
+
+ /* Only regular files can be reflinked. */
+ if (!S_ISREG(inode->i_mode))
+ return -EPERM;
+
+ /*
+ * If the caller wants to preserve ownership, they require the
+ * rights to do so.
+ */
+ if (preserve) {
+ if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN))
+ return -EPERM;
+ if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
+ return -EPERM;
+ }
+
+ /*
+ * If the caller is modifying any aspect of the attributes, they
+ * are not creating a snapshot. They need read permission on the
+ * file.
+ */
+ if (!preserve) {
+ error = inode_permission(inode, MAY_READ);
+ if (error)
+ return error;
+ }
+
+ mutex_lock(&inode->i_mutex);
+ dquot_initialize(dir);
+ error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
+ mutex_unlock(&inode->i_mutex);
+ if (!error)
+ fsnotify_create(dir, new_dentry);
+ return error;
+}
+/*
+ * Most codes are copied from sys_linkat.
+ */
+int ocfs2_reflink_ioctl(struct inode *inode,
+ const char __user *oldname,
+ const char __user *newname,
+ bool preserve)
+{
+ struct dentry *new_dentry;
+ struct path old_path, new_path;
+ int error;
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
+ if (error) {
+ mlog_errno(error);
+ return error;
+ }
+
+ new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
+ error = PTR_ERR(new_dentry);
+ if (IS_ERR(new_dentry)) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ error = -EXDEV;
+ if (old_path.mnt != new_path.mnt) {
+ mlog_errno(error);
+ goto out_dput;
+ }
+
+ error = ocfs2_vfs_reflink(old_path.dentry,
+ new_path.dentry->d_inode,
+ new_dentry, preserve);
+out_dput:
+ done_path_create(&new_path, new_dentry);
+out:
+ path_put(&old_path);
+
+ return error;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
new file mode 100644
index 00000000000..6422bbcdb52
--- /dev/null
+++ b/fs/ocfs2/refcounttree.h
@@ -0,0 +1,118 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * refcounttree.h
+ *
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_REFCOUNTTREE_H
+#define OCFS2_REFCOUNTTREE_H
+
+struct ocfs2_refcount_tree {
+ struct rb_node rf_node;
+ u64 rf_blkno;
+ u32 rf_generation;
+ struct kref rf_getcnt;
+ struct rw_semaphore rf_sem;
+ struct ocfs2_lock_res rf_lockres;
+ int rf_removed;
+
+ /* the following 4 fields are used by caching_info. */
+ spinlock_t rf_lock;
+ struct ocfs2_caching_info rf_ci;
+ struct mutex rf_io_mutex;
+ struct super_block *rf_sb;
+};
+
+void ocfs2_purge_refcount_trees(struct ocfs2_super *osb);
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
+ struct ocfs2_refcount_tree **tree,
+ struct buffer_head **ref_bh);
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *tree,
+ int rw);
+
+int ocfs2_decrease_refcount(struct inode *inode,
+ handle_t *handle, u32 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int delete);
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+ u64 refcount_loc,
+ u64 phys_blkno,
+ u32 clusters,
+ int *credits,
+ int *ref_blocks);
+int ocfs2_refcount_cow(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 cpos, u32 write_len, u32 max_cpos);
+
+typedef int (ocfs2_post_refcount_func)(struct inode *inode,
+ handle_t *handle,
+ void *para);
+/*
+ * Some refcount caller need to do more work after we modify the data b-tree
+ * during refcount operation(including CoW and add refcount flag), and make the
+ * transaction complete. So it must give us this structure so that we can do it
+ * within our transaction.
+ *
+ */
+struct ocfs2_post_refcount {
+ int credits; /* credits it need for journal. */
+ ocfs2_post_refcount_func *func; /* real function. */
+ void *para;
+};
+
+int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_xattr_value_root *xv,
+ int *meta_add, int *credits);
+int ocfs2_refcount_cow_xattr(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_refcount_tree *ref_tree,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 write_len,
+ struct ocfs2_post_refcount *post);
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len);
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len);
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+ struct inode *inode,
+ u32 cpos, u32 num_clusters);
+int ocfs2_add_refcount_flag(struct inode *inode,
+ struct ocfs2_extent_tree *data_et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 p_cluster, u32 num_clusters,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_post_refcount *post);
+int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
+int ocfs2_try_remove_refcount_tree(struct inode *inode,
+ struct buffer_head *di_bh);
+int ocfs2_increase_refcount(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_reflink_ioctl(struct inode *inode,
+ const char __user *oldname,
+ const char __user *newname,
+ bool preserve);
+#endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644
index 00000000000..41ffd36c689
--- /dev/null
+++ b/fs/ocfs2/reservations.c
@@ -0,0 +1,839 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.c
+ *
+ * Allocation reservations implementation
+ *
+ * Some code borrowed from fs/ext3/balloc.c and is:
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * The rest is copyright (C) 2010 Novell. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_trace.h"
+
+#ifdef CONFIG_OCFS2_DEBUG_FS
+#define OCFS2_CHECK_RESERVATIONS
+#endif
+
+DEFINE_SPINLOCK(resv_lock);
+
+#define OCFS2_MIN_RESV_WINDOW_BITS 8
+#define OCFS2_MAX_RESV_WINDOW_BITS 1024
+
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
+{
+ return (osb->osb_resv_level && osb->osb_dir_resv_level);
+}
+
+static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ struct ocfs2_super *osb = resmap->m_osb;
+ unsigned int bits;
+
+ if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
+ /* 8, 16, 32, 64, 128, 256, 512, 1024 */
+ bits = 4 << osb->osb_resv_level;
+ } else {
+ bits = 4 << osb->osb_dir_resv_level;
+ }
+ return bits;
+}
+
+static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
+{
+ if (resv->r_len)
+ return resv->r_start + resv->r_len - 1;
+ return resv->r_start;
+}
+
+static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
+{
+ return !!(resv->r_len == 0);
+}
+
+static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
+{
+ if (resmap->m_osb->osb_resv_level == 0)
+ return 1;
+ return 0;
+}
+
+static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
+{
+ struct ocfs2_super *osb = resmap->m_osb;
+ struct rb_node *node;
+ struct ocfs2_alloc_reservation *resv;
+ int i = 0;
+
+ mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
+ osb->dev_str, resmap->m_bitmap_len);
+
+ node = rb_first(&resmap->m_reservations);
+ while (node) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
+ "\tlast_len: %u\n", resv->r_start,
+ ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+ resv->r_last_len);
+
+ node = rb_next(node);
+ i++;
+ }
+
+ mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
+
+ i = 0;
+ list_for_each_entry(resv, &resmap->m_lru, r_lru) {
+ mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
+ "last_start: %u\tlast_len: %u\n", i, resv->r_start,
+ ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+ resv->r_last_len);
+
+ i++;
+ }
+}
+
+#ifdef OCFS2_CHECK_RESERVATIONS
+static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
+ int i,
+ struct ocfs2_alloc_reservation *resv)
+{
+ char *disk_bitmap = resmap->m_disk_bitmap;
+ unsigned int start = resv->r_start;
+ unsigned int end = ocfs2_resv_end(resv);
+
+ while (start <= end) {
+ if (ocfs2_test_bit(start, disk_bitmap)) {
+ mlog(ML_ERROR,
+ "reservation %d covers an allocated area "
+ "starting at bit %u!\n", i, start);
+ return 1;
+ }
+
+ start++;
+ }
+ return 0;
+}
+
+static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+ unsigned int off = 0;
+ int i = 0;
+ struct rb_node *node;
+ struct ocfs2_alloc_reservation *resv;
+
+ node = rb_first(&resmap->m_reservations);
+ while (node) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ if (i > 0 && resv->r_start <= off) {
+ mlog(ML_ERROR, "reservation %d has bad start off!\n",
+ i);
+ goto bad;
+ }
+
+ if (resv->r_len == 0) {
+ mlog(ML_ERROR, "reservation %d has no length!\n",
+ i);
+ goto bad;
+ }
+
+ if (resv->r_start > ocfs2_resv_end(resv)) {
+ mlog(ML_ERROR, "reservation %d has invalid range!\n",
+ i);
+ goto bad;
+ }
+
+ if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
+ mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
+ i);
+ goto bad;
+ }
+
+ if (ocfs2_validate_resmap_bits(resmap, i, resv))
+ goto bad;
+
+ off = ocfs2_resv_end(resv);
+ node = rb_next(node);
+
+ i++;
+ }
+ return;
+
+bad:
+ ocfs2_dump_resv(resmap);
+ BUG();
+}
+#else
+static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+
+}
+#endif
+
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
+{
+ memset(resv, 0, sizeof(*resv));
+ INIT_LIST_HEAD(&resv->r_lru);
+}
+
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+ unsigned int flags)
+{
+ BUG_ON(flags & ~OCFS2_RESV_TYPES);
+
+ resv->r_flags |= flags;
+}
+
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+ struct ocfs2_reservation_map *resmap)
+{
+ memset(resmap, 0, sizeof(*resmap));
+
+ resmap->m_osb = osb;
+ resmap->m_reservations = RB_ROOT;
+ /* m_bitmap_len is initialized to zero by the above memset. */
+ INIT_LIST_HEAD(&resmap->m_lru);
+
+ return 0;
+}
+
+static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ assert_spin_locked(&resv_lock);
+
+ if (!list_empty(&resv->r_lru))
+ list_del_init(&resv->r_lru);
+
+ list_add_tail(&resv->r_lru, &resmap->m_lru);
+}
+
+static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
+{
+ resv->r_len = 0;
+ resv->r_start = 0;
+}
+
+static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
+ list_del_init(&resv->r_lru);
+ rb_erase(&resv->r_node, &resmap->m_reservations);
+ resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
+ }
+}
+
+static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ assert_spin_locked(&resv_lock);
+
+ __ocfs2_resv_trunc(resv);
+ /*
+ * last_len and last_start no longer make sense if
+ * we're changing the range of our allocations.
+ */
+ resv->r_last_len = resv->r_last_start = 0;
+
+ ocfs2_resv_remove(resmap, resv);
+}
+
+/* does nothing if 'resv' is null */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ if (resv) {
+ spin_lock(&resv_lock);
+ __ocfs2_resv_discard(resmap, resv);
+ spin_unlock(&resv_lock);
+ }
+}
+
+static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
+{
+ struct rb_node *node;
+ struct ocfs2_alloc_reservation *resv;
+
+ assert_spin_locked(&resv_lock);
+
+ while ((node = rb_last(&resmap->m_reservations)) != NULL) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ __ocfs2_resv_discard(resmap, resv);
+ }
+}
+
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+ unsigned int clen, char *disk_bitmap)
+{
+ if (ocfs2_resmap_disabled(resmap))
+ return;
+
+ spin_lock(&resv_lock);
+
+ ocfs2_resmap_clear_all_resv(resmap);
+ resmap->m_bitmap_len = clen;
+ resmap->m_disk_bitmap = disk_bitmap;
+
+ spin_unlock(&resv_lock);
+}
+
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
+{
+ /* Does nothing for now. Keep this around for API symmetry */
+}
+
+static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *new)
+{
+ struct rb_root *root = &resmap->m_reservations;
+ struct rb_node *parent = NULL;
+ struct rb_node **p = &root->rb_node;
+ struct ocfs2_alloc_reservation *tmp;
+
+ assert_spin_locked(&resv_lock);
+
+ trace_ocfs2_resv_insert(new->r_start, new->r_len);
+
+ while (*p) {
+ parent = *p;
+
+ tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
+
+ if (new->r_start < tmp->r_start) {
+ p = &(*p)->rb_left;
+
+ /*
+ * This is a good place to check for
+ * overlapping reservations.
+ */
+ BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
+ } else if (new->r_start > ocfs2_resv_end(tmp)) {
+ p = &(*p)->rb_right;
+ } else {
+ /* This should never happen! */
+ mlog(ML_ERROR, "Duplicate reservation window!\n");
+ BUG();
+ }
+ }
+
+ rb_link_node(&new->r_node, parent, p);
+ rb_insert_color(&new->r_node, root);
+ new->r_flags |= OCFS2_RESV_FLAG_INUSE;
+
+ ocfs2_resv_mark_lru(resmap, new);
+
+ ocfs2_check_resmap(resmap);
+}
+
+/**
+ * ocfs2_find_resv_lhs() - find the window which contains goal
+ * @resmap: reservation map to search
+ * @goal: which bit to search for
+ *
+ * If a window containing that goal is not found, we return the window
+ * which comes before goal. Returns NULL on empty rbtree or no window
+ * before goal.
+ */
+static struct ocfs2_alloc_reservation *
+ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
+{
+ struct ocfs2_alloc_reservation *resv = NULL;
+ struct ocfs2_alloc_reservation *prev_resv = NULL;
+ struct rb_node *node = resmap->m_reservations.rb_node;
+
+ assert_spin_locked(&resv_lock);
+
+ if (!node)
+ return NULL;
+
+ node = rb_first(&resmap->m_reservations);
+ while (node) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
+ break;
+
+ /* Check if we overshot the reservation just before goal? */
+ if (resv->r_start > goal) {
+ resv = prev_resv;
+ break;
+ }
+
+ prev_resv = resv;
+ node = rb_next(node);
+ }
+
+ return resv;
+}
+
+/*
+ * We are given a range within the bitmap, which corresponds to a gap
+ * inside the reservations tree (search_start, search_len). The range
+ * can be anything from the whole bitmap, to a gap between
+ * reservations.
+ *
+ * The start value of *rstart is insignificant.
+ *
+ * This function searches the bitmap range starting at search_start
+ * with length search_len for a set of contiguous free bits. We try
+ * to find up to 'wanted' bits, but can sometimes return less.
+ *
+ * Returns the length of allocation, 0 if no free bits are found.
+ *
+ * *cstart and *clen will also be populated with the result.
+ */
+static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
+ unsigned int wanted,
+ unsigned int search_start,
+ unsigned int search_len,
+ unsigned int *rstart,
+ unsigned int *rlen)
+{
+ void *bitmap = resmap->m_disk_bitmap;
+ unsigned int best_start, best_len = 0;
+ int offset, start, found;
+
+ trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len,
+ wanted, resmap->m_bitmap_len);
+
+ found = best_start = best_len = 0;
+
+ start = search_start;
+ while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
+ start)) != -1) {
+ /* Search reached end of the region */
+ if (offset >= (search_start + search_len))
+ break;
+
+ if (offset == start) {
+ /* we found a zero */
+ found++;
+ /* move start to the next bit to test */
+ start++;
+ } else {
+ /* got a zero after some ones */
+ found = 1;
+ start = offset + 1;
+ }
+ if (found > best_len) {
+ best_len = found;
+ best_start = start - found;
+ }
+
+ if (found >= wanted)
+ break;
+ }
+
+ if (best_len == 0)
+ return 0;
+
+ if (best_len >= wanted)
+ best_len = wanted;
+
+ *rlen = best_len;
+ *rstart = best_start;
+
+ trace_ocfs2_resmap_find_free_bits_end(best_start, best_len);
+
+ return *rlen;
+}
+
+static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int goal, unsigned int wanted)
+{
+ struct rb_root *root = &resmap->m_reservations;
+ unsigned int gap_start, gap_end, gap_len;
+ struct ocfs2_alloc_reservation *prev_resv, *next_resv;
+ struct rb_node *prev, *next;
+ unsigned int cstart, clen;
+ unsigned int best_start = 0, best_len = 0;
+
+ /*
+ * Nasty cases to consider:
+ *
+ * - rbtree is empty
+ * - our window should be first in all reservations
+ * - our window should be last in all reservations
+ * - need to make sure we don't go past end of bitmap
+ */
+ trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv),
+ goal, wanted, RB_EMPTY_ROOT(root));
+
+ assert_spin_locked(&resv_lock);
+
+ if (RB_EMPTY_ROOT(root)) {
+ /*
+ * Easiest case - empty tree. We can just take
+ * whatever window of free bits we want.
+ */
+ clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+ resmap->m_bitmap_len - goal,
+ &cstart, &clen);
+
+ /*
+ * This should never happen - the local alloc window
+ * will always have free bits when we're called.
+ */
+ BUG_ON(goal == 0 && clen == 0);
+
+ if (clen == 0)
+ return;
+
+ resv->r_start = cstart;
+ resv->r_len = clen;
+
+ ocfs2_resv_insert(resmap, resv);
+ return;
+ }
+
+ prev_resv = ocfs2_find_resv_lhs(resmap, goal);
+
+ if (prev_resv == NULL) {
+ /*
+ * A NULL here means that the search code couldn't
+ * find a window that starts before goal.
+ *
+ * However, we can take the first window after goal,
+ * which is also by definition, the leftmost window in
+ * the entire tree. If we can find free bits in the
+ * gap between goal and the LHS window, then the
+ * reservation can safely be placed there.
+ *
+ * Otherwise we fall back to a linear search, checking
+ * the gaps in between windows for a place to
+ * allocate.
+ */
+
+ next = rb_first(root);
+ next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
+ r_node);
+
+ /*
+ * The search should never return such a window. (see
+ * comment above
+ */
+ if (next_resv->r_start <= goal) {
+ mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
+ goal, next_resv->r_start, next_resv->r_len);
+ ocfs2_dump_resv(resmap);
+ BUG();
+ }
+
+ clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+ next_resv->r_start - goal,
+ &cstart, &clen);
+ if (clen) {
+ best_len = clen;
+ best_start = cstart;
+ if (best_len == wanted)
+ goto out_insert;
+ }
+
+ prev_resv = next_resv;
+ next_resv = NULL;
+ }
+
+ trace_ocfs2_resv_find_window_prev(prev_resv->r_start,
+ ocfs2_resv_end(prev_resv));
+
+ prev = &prev_resv->r_node;
+
+ /* Now we do a linear search for a window, starting at 'prev_rsv' */
+ while (1) {
+ next = rb_next(prev);
+ if (next) {
+ next_resv = rb_entry(next,
+ struct ocfs2_alloc_reservation,
+ r_node);
+
+ gap_start = ocfs2_resv_end(prev_resv) + 1;
+ gap_end = next_resv->r_start - 1;
+ gap_len = gap_end - gap_start + 1;
+ } else {
+ /*
+ * We're at the rightmost edge of the
+ * tree. See if a reservation between this
+ * window and the end of the bitmap will work.
+ */
+ gap_start = ocfs2_resv_end(prev_resv) + 1;
+ gap_len = resmap->m_bitmap_len - gap_start;
+ gap_end = resmap->m_bitmap_len - 1;
+ }
+
+ trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
+ next ? ocfs2_resv_end(next_resv) : -1);
+ /*
+ * No need to check this gap if we have already found
+ * a larger region of free bits.
+ */
+ if (gap_len <= best_len)
+ goto next_resv;
+
+ clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
+ gap_len, &cstart, &clen);
+ if (clen == wanted) {
+ best_len = clen;
+ best_start = cstart;
+ goto out_insert;
+ } else if (clen > best_len) {
+ best_len = clen;
+ best_start = cstart;
+ }
+
+next_resv:
+ if (!next)
+ break;
+
+ prev = next;
+ prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
+ r_node);
+ }
+
+out_insert:
+ if (best_len) {
+ resv->r_start = best_start;
+ resv->r_len = best_len;
+ ocfs2_resv_insert(resmap, resv);
+ }
+}
+
+static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int wanted)
+{
+ struct ocfs2_alloc_reservation *lru_resv;
+ int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
+ unsigned int min_bits;
+
+ if (!tmpwindow)
+ min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
+ else
+ min_bits = wanted; /* We at know the temp window will use all
+ * of these bits */
+
+ /*
+ * Take the first reservation off the LRU as our 'target'. We
+ * don't try to be smart about it. There might be a case for
+ * searching based on size but I don't have enough data to be
+ * sure. --Mark (3/16/2010)
+ */
+ lru_resv = list_first_entry(&resmap->m_lru,
+ struct ocfs2_alloc_reservation, r_lru);
+
+ trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start,
+ lru_resv->r_len,
+ ocfs2_resv_end(lru_resv));
+
+ /*
+ * Cannibalize (some or all) of the target reservation and
+ * feed it to the current window.
+ */
+ if (lru_resv->r_len <= min_bits) {
+ /*
+ * Discard completely if size is less than or equal to a
+ * reasonable threshold - 50% of window bits for non temporary
+ * windows.
+ */
+ resv->r_start = lru_resv->r_start;
+ resv->r_len = lru_resv->r_len;
+
+ __ocfs2_resv_discard(resmap, lru_resv);
+ } else {
+ unsigned int shrink;
+ if (tmpwindow)
+ shrink = min_bits;
+ else
+ shrink = lru_resv->r_len / 2;
+
+ lru_resv->r_len -= shrink;
+
+ resv->r_start = ocfs2_resv_end(lru_resv) + 1;
+ resv->r_len = shrink;
+ }
+
+ trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv),
+ resv->r_len, resv->r_last_start,
+ resv->r_last_len);
+
+ ocfs2_resv_insert(resmap, resv);
+}
+
+static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int wanted)
+{
+ unsigned int goal = 0;
+
+ BUG_ON(!ocfs2_resv_empty(resv));
+
+ /*
+ * Begin by trying to get a window as close to the previous
+ * one as possible. Using the most recent allocation as a
+ * start goal makes sense.
+ */
+ if (resv->r_last_len) {
+ goal = resv->r_last_start + resv->r_last_len;
+ if (goal >= resmap->m_bitmap_len)
+ goal = 0;
+ }
+
+ __ocfs2_resv_find_window(resmap, resv, goal, wanted);
+
+ /* Search from last alloc didn't work, try once more from beginning. */
+ if (ocfs2_resv_empty(resv) && goal != 0)
+ __ocfs2_resv_find_window(resmap, resv, 0, wanted);
+
+ if (ocfs2_resv_empty(resv)) {
+ /*
+ * Still empty? Pull oldest one off the LRU, remove it from
+ * tree, put this one in it's place.
+ */
+ ocfs2_cannibalize_resv(resmap, resv, wanted);
+ }
+
+ BUG_ON(ocfs2_resv_empty(resv));
+}
+
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ int *cstart, int *clen)
+{
+ if (resv == NULL || ocfs2_resmap_disabled(resmap))
+ return -ENOSPC;
+
+ spin_lock(&resv_lock);
+
+ if (ocfs2_resv_empty(resv)) {
+ /*
+ * We don't want to over-allocate for temporary
+ * windows. Otherwise, we run the risk of fragmenting the
+ * allocation space.
+ */
+ unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
+
+ if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
+ wanted = *clen;
+
+ /*
+ * Try to get a window here. If it works, we must fall
+ * through and test the bitmap . This avoids some
+ * ping-ponging of windows due to non-reserved space
+ * being allocation before we initialize a window for
+ * that inode.
+ */
+ ocfs2_resv_find_window(resmap, resv, wanted);
+ trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len);
+ }
+
+ BUG_ON(ocfs2_resv_empty(resv));
+
+ *cstart = resv->r_start;
+ *clen = resv->r_len;
+
+ spin_unlock(&resv_lock);
+ return 0;
+}
+
+static void
+ ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int start, unsigned int end)
+{
+ unsigned int rhs = 0;
+ unsigned int old_end = ocfs2_resv_end(resv);
+
+ BUG_ON(start != resv->r_start || old_end < end);
+
+ /*
+ * Completely used? We can remove it then.
+ */
+ if (old_end == end) {
+ __ocfs2_resv_discard(resmap, resv);
+ return;
+ }
+
+ rhs = old_end - end;
+
+ /*
+ * This should have been trapped above.
+ */
+ BUG_ON(rhs == 0);
+
+ resv->r_start = end + 1;
+ resv->r_len = old_end - resv->r_start + 1;
+}
+
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ u32 cstart, u32 clen)
+{
+ unsigned int cend = cstart + clen - 1;
+
+ if (resmap == NULL || ocfs2_resmap_disabled(resmap))
+ return;
+
+ if (resv == NULL)
+ return;
+
+ BUG_ON(cstart != resv->r_start);
+
+ spin_lock(&resv_lock);
+
+ trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start,
+ ocfs2_resv_end(resv), resv->r_len,
+ resv->r_last_start,
+ resv->r_last_len);
+
+ BUG_ON(cstart < resv->r_start);
+ BUG_ON(cstart > ocfs2_resv_end(resv));
+ BUG_ON(cend > ocfs2_resv_end(resv));
+
+ ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
+ resv->r_last_start = cstart;
+ resv->r_last_len = clen;
+
+ /*
+ * May have been discarded above from
+ * ocfs2_adjust_resv_from_alloc().
+ */
+ if (!ocfs2_resv_empty(resv))
+ ocfs2_resv_mark_lru(resmap, resv);
+
+ trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv),
+ resv->r_len, resv->r_last_start,
+ resv->r_last_len);
+
+ ocfs2_check_resmap(resmap);
+
+ spin_unlock(&resv_lock);
+}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644
index 00000000000..42c2b804f3f
--- /dev/null
+++ b/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.h
+ *
+ * Allocation reservations function prototypes and structures.
+ *
+ * Copyright (C) 2010 Novell. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_RESERVATIONS_H
+#define OCFS2_RESERVATIONS_H
+
+#include <linux/rbtree.h>
+
+#define OCFS2_DEFAULT_RESV_LEVEL 2
+#define OCFS2_MAX_RESV_LEVEL 9
+#define OCFS2_MIN_RESV_LEVEL 0
+
+struct ocfs2_alloc_reservation {
+ struct rb_node r_node;
+
+ unsigned int r_start; /* Beginning of current window */
+ unsigned int r_len; /* Length of the window */
+
+ unsigned int r_last_len; /* Length of most recent alloc */
+ unsigned int r_last_start; /* Start of most recent alloc */
+ struct list_head r_lru; /* LRU list head */
+
+ unsigned int r_flags;
+};
+
+#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */
+#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
+ * destroyed immedately after use */
+#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
+ * directory btree */
+
+struct ocfs2_reservation_map {
+ struct rb_root m_reservations;
+ char *m_disk_bitmap;
+
+ struct ocfs2_super *m_osb;
+
+ /* The following are not initialized to meaningful values until a disk
+ * bitmap is provided. */
+ u32 m_bitmap_len; /* Number of valid
+ * bits available */
+
+ struct list_head m_lru; /* LRU of reservations
+ * structures. */
+
+};
+
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
+
+#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+ unsigned int flags);
+
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
+
+/**
+ * ocfs2_resv_discard() - truncate a reservation
+ * @resmap:
+ * @resv: the reservation to truncate.
+ *
+ * After this function is called, the reservation will be empty, and
+ * unlinked from the rbtree.
+ */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv);
+
+
+/**
+ * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
+ * @resmap: struct ocfs2_reservation_map to initialize
+ * @obj: unused for now
+ * @ops: unused for now
+ * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize)
+ *
+ * Only possible return value other than '0' is -ENOMEM for failure to
+ * allocation mirror bitmap.
+ */
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+ struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_restart() - "restart" a reservation bitmap
+ * @resmap: reservations bitmap
+ * @clen: Number of valid bits in the bitmap
+ * @disk_bitmap: the disk bitmap this resmap should refer to.
+ *
+ * Re-initialize the parameters of a reservation bitmap. This is
+ * useful for local alloc window slides.
+ *
+ * This function will call ocfs2_trunc_resv against all existing
+ * reservations. A future version will recalculate existing
+ * reservations based on the new bitmap.
+ */
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+ unsigned int clen, char *disk_bitmap);
+
+/**
+ * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
+ * @resmap: the struct ocfs2_reservation_map to uninitialize
+ */
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
+ * @resmap: reservations bitmap
+ * @resv: reservation to base search from
+ * @cstart: start of proposed allocation
+ * @clen: length (in clusters) of proposed allocation
+ *
+ * Using the reservation data from resv, this function will compare
+ * resmap and resmap->m_disk_bitmap to determine what part (if any) of
+ * the reservation window is still clear to use. If resv is empty,
+ * this function will try to allocate a window for it.
+ *
+ * On success, zero is returned and the valid allocation area is set in cstart
+ * and clen.
+ *
+ * Returns -ENOSPC if reservations are disabled.
+ */
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ int *cstart, int *clen);
+
+/**
+ * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
+ * @resmap: reservations bitmap
+ * @resv: optional reservation to recalulate based on new bitmap
+ * @cstart: start of allocation in clusters
+ * @clen: end of allocation in clusters.
+ *
+ * Tell the reservation code that bits were used to fulfill allocation in
+ * resmap. The bits don't have to have been part of any existing
+ * reservation. But we must always call this function when bits are claimed.
+ * Internally, the reservations code will use this information to mark the
+ * reservations bitmap. If resv is passed, it's next allocation window will be
+ * calculated. It also expects that 'cstart' is the same as we passed back
+ * from ocfs2_resmap_resv_bits().
+ */
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ u32 cstart, u32 clen);
+
+#endif /* OCFS2_RESERVATIONS_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
new file mode 100644
index 00000000000..d5da6f62414
--- /dev/null
+++ b/fs/ocfs2/resize.c
@@ -0,0 +1,589 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * resize.c
+ *
+ * volume resize.
+ * Inspired by ext3/resize.c.
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "super.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "ocfs2_trace.h"
+
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "resize.h"
+
+/*
+ * Check whether there are new backup superblocks exist
+ * in the last group. If there are some, mark them or clear
+ * them in the bitmap.
+ *
+ * Return how many backups we find in the last group.
+ */
+static u16 ocfs2_calc_new_backup_super(struct inode *inode,
+ struct ocfs2_group_desc *gd,
+ u16 cl_cpg,
+ int set)
+{
+ int i;
+ u16 backups = 0;
+ u32 cluster;
+ u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
+
+ for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+ blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
+ cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+ gd_blkno = ocfs2_which_cluster_group(inode, cluster);
+ if (gd_blkno < lgd_blkno)
+ continue;
+ else if (gd_blkno > lgd_blkno)
+ break;
+
+ if (set)
+ ocfs2_set_bit(cluster % cl_cpg,
+ (unsigned long *)gd->bg_bitmap);
+ else
+ ocfs2_clear_bit(cluster % cl_cpg,
+ (unsigned long *)gd->bg_bitmap);
+ backups++;
+ }
+
+ return backups;
+}
+
+static int ocfs2_update_last_group_and_inode(handle_t *handle,
+ struct inode *bm_inode,
+ struct buffer_head *bm_bh,
+ struct buffer_head *group_bh,
+ u32 first_new_cluster,
+ int new_clusters)
+{
+ int ret = 0;
+ struct ocfs2_super *osb = OCFS2_SB(bm_inode->i_sb);
+ struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bm_bh->b_data;
+ struct ocfs2_chain_list *cl = &fe->id2.i_chain;
+ struct ocfs2_chain_rec *cr;
+ struct ocfs2_group_desc *group;
+ u16 chain, num_bits, backups = 0;
+ u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
+ u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
+
+ trace_ocfs2_update_last_group_and_inode(new_clusters,
+ first_new_cluster);
+
+ ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
+ group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ group = (struct ocfs2_group_desc *)group_bh->b_data;
+
+ /* update the group first. */
+ num_bits = new_clusters * cl_bpc;
+ le16_add_cpu(&group->bg_bits, num_bits);
+ le16_add_cpu(&group->bg_free_bits_count, num_bits);
+
+ /*
+ * check whether there are some new backup superblocks exist in
+ * this group and update the group bitmap accordingly.
+ */
+ if (OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
+ backups = ocfs2_calc_new_backup_super(bm_inode,
+ group,
+ cl_cpg, 1);
+ le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
+ }
+
+ ocfs2_journal_dirty(handle, group_bh);
+
+ /* update the inode accordingly. */
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_rollback;
+ }
+
+ chain = le16_to_cpu(group->bg_chain);
+ cr = (&cl->cl_recs[chain]);
+ le32_add_cpu(&cr->c_total, num_bits);
+ le32_add_cpu(&cr->c_free, num_bits);
+ le32_add_cpu(&fe->id1.bitmap1.i_total, num_bits);
+ le32_add_cpu(&fe->i_clusters, new_clusters);
+
+ if (backups) {
+ le32_add_cpu(&cr->c_free, -1 * backups);
+ le32_add_cpu(&fe->id1.bitmap1.i_used, backups);
+ }
+
+ spin_lock(&OCFS2_I(bm_inode)->ip_lock);
+ OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+ le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits);
+ spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
+ i_size_write(bm_inode, le64_to_cpu(fe->i_size));
+
+ ocfs2_journal_dirty(handle, bm_bh);
+
+out_rollback:
+ if (ret < 0) {
+ ocfs2_calc_new_backup_super(bm_inode,
+ group,
+ cl_cpg, 0);
+ le16_add_cpu(&group->bg_free_bits_count, backups);
+ le16_add_cpu(&group->bg_bits, -1 * num_bits);
+ le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
+ }
+out:
+ if (ret)
+ mlog_errno(ret);
+ return ret;
+}
+
+static int update_backups(struct inode * inode, u32 clusters, char *data)
+{
+ int i, ret = 0;
+ u32 cluster;
+ u64 blkno;
+ struct buffer_head *backup = NULL;
+ struct ocfs2_dinode *backup_di = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ /* calculate the real backups we need to update. */
+ for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+ blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
+ cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+ if (cluster > clusters)
+ break;
+
+ ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
+ memcpy(backup->b_data, data, inode->i_sb->s_blocksize);
+
+ backup_di = (struct ocfs2_dinode *)backup->b_data;
+ backup_di->i_blkno = cpu_to_le64(blkno);
+
+ ret = ocfs2_write_super_or_backup(osb, backup);
+ brelse(backup);
+ backup = NULL;
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static void ocfs2_update_super_and_backups(struct inode *inode,
+ int new_clusters)
+{
+ int ret;
+ u32 clusters = 0;
+ struct buffer_head *super_bh = NULL;
+ struct ocfs2_dinode *super_di = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ /*
+ * update the superblock last.
+ * It doesn't matter if the write failed.
+ */
+ ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1,
+ &super_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ super_di = (struct ocfs2_dinode *)super_bh->b_data;
+ le32_add_cpu(&super_di->i_clusters, new_clusters);
+ clusters = le32_to_cpu(super_di->i_clusters);
+
+ ret = ocfs2_write_super_or_backup(osb, super_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_BACKUP_SB))
+ ret = update_backups(inode, clusters, super_bh->b_data);
+
+out:
+ brelse(super_bh);
+ if (ret)
+ printk(KERN_WARNING "ocfs2: Failed to update super blocks on %s"
+ " during fs resize. This condition is not fatal,"
+ " but fsck.ocfs2 should be run to fix it\n",
+ osb->dev_str);
+ return;
+}
+
+/*
+ * Extend the filesystem to the new number of clusters specified. This entry
+ * point is only used to extend the current filesystem to the end of the last
+ * existing group.
+ */
+int ocfs2_group_extend(struct inode * inode, int new_clusters)
+{
+ int ret;
+ handle_t *handle;
+ struct buffer_head *main_bm_bh = NULL;
+ struct buffer_head *group_bh = NULL;
+ struct inode *main_bm_inode = NULL;
+ struct ocfs2_dinode *fe = NULL;
+ struct ocfs2_group_desc *group = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ u16 cl_bpc;
+ u32 first_new_cluster;
+ u64 lgd_blkno;
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ if (new_clusters < 0)
+ return -EINVAL;
+ else if (new_clusters == 0)
+ return 0;
+
+ main_bm_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!main_bm_inode) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock(&main_bm_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+
+ fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+ /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
+ * so any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
+ if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
+ ocfs2_group_bitmap_size(osb->sb, 0,
+ osb->s_feature_incompat) * 8) {
+ mlog(ML_ERROR, "The disk is too old and small. "
+ "Force to do offline resize.");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ first_new_cluster = le32_to_cpu(fe->i_clusters);
+ lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
+ first_new_cluster - 1);
+
+ ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
+ &group_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+ group = (struct ocfs2_group_desc *)group_bh->b_data;
+
+ cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
+ if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
+ le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+
+ trace_ocfs2_group_extend(
+ (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
+
+ handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
+ if (IS_ERR(handle)) {
+ mlog_errno(PTR_ERR(handle));
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* update the last group descriptor and inode. */
+ ret = ocfs2_update_last_group_and_inode(handle, main_bm_inode,
+ main_bm_bh, group_bh,
+ first_new_cluster,
+ new_clusters);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ocfs2_update_super_and_backups(main_bm_inode, new_clusters);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out_unlock:
+ brelse(group_bh);
+ brelse(main_bm_bh);
+
+ ocfs2_inode_unlock(main_bm_inode, 1);
+
+out_mutex:
+ mutex_unlock(&main_bm_inode->i_mutex);
+ iput(main_bm_inode);
+
+out:
+ return ret;
+}
+
+static int ocfs2_check_new_group(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_new_group_input *input,
+ struct buffer_head *group_bh)
+{
+ int ret;
+ struct ocfs2_group_desc *gd =
+ (struct ocfs2_group_desc *)group_bh->b_data;
+ u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
+
+ ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
+ if (ret)
+ goto out;
+
+ ret = -EINVAL;
+ if (le16_to_cpu(gd->bg_chain) != input->chain)
+ mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
+ "while input has %u set.\n",
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ le16_to_cpu(gd->bg_chain), input->chain);
+ else if (le16_to_cpu(gd->bg_bits) != input->clusters * cl_bpc)
+ mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
+ "input has %u clusters set\n",
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ le16_to_cpu(gd->bg_bits), input->clusters);
+ else if (le16_to_cpu(gd->bg_free_bits_count) != input->frees * cl_bpc)
+ mlog(ML_ERROR, "Group descriptor # %llu has free bit count %u "
+ "but it should have %u set\n",
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ le16_to_cpu(gd->bg_bits),
+ input->frees * cl_bpc);
+ else
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int ocfs2_verify_group_and_input(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_new_group_input *input,
+ struct buffer_head *group_bh)
+{
+ u16 cl_count = le16_to_cpu(di->id2.i_chain.cl_count);
+ u16 cl_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
+ u16 next_free = le16_to_cpu(di->id2.i_chain.cl_next_free_rec);
+ u32 cluster = ocfs2_blocks_to_clusters(inode->i_sb, input->group);
+ u32 total_clusters = le32_to_cpu(di->i_clusters);
+ int ret = -EINVAL;
+
+ if (cluster < total_clusters)
+ mlog(ML_ERROR, "add a group which is in the current volume.\n");
+ else if (input->chain >= cl_count)
+ mlog(ML_ERROR, "input chain exceeds the limit.\n");
+ else if (next_free != cl_count && next_free != input->chain)
+ mlog(ML_ERROR,
+ "the add group should be in chain %u\n", next_free);
+ else if (total_clusters + input->clusters < total_clusters)
+ mlog(ML_ERROR, "add group's clusters overflow.\n");
+ else if (input->clusters > cl_cpg)
+ mlog(ML_ERROR, "the cluster exceeds the maximum of a group\n");
+ else if (input->frees > input->clusters)
+ mlog(ML_ERROR, "the free cluster exceeds the total clusters\n");
+ else if (total_clusters % cl_cpg != 0)
+ mlog(ML_ERROR,
+ "the last group isn't full. Use group extend first.\n");
+ else if (input->group != ocfs2_which_cluster_group(inode, cluster))
+ mlog(ML_ERROR, "group blkno is invalid\n");
+ else if ((ret = ocfs2_check_new_group(inode, di, input, group_bh)))
+ mlog(ML_ERROR, "group descriptor check failed.\n");
+ else
+ ret = 0;
+
+ return ret;
+}
+
+/* Add a new group descriptor to global_bitmap. */
+int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
+{
+ int ret;
+ handle_t *handle;
+ struct buffer_head *main_bm_bh = NULL;
+ struct inode *main_bm_inode = NULL;
+ struct ocfs2_dinode *fe = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *group_bh = NULL;
+ struct ocfs2_group_desc *group = NULL;
+ struct ocfs2_chain_list *cl;
+ struct ocfs2_chain_rec *cr;
+ u16 cl_bpc;
+ u64 bg_ptr;
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ main_bm_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!main_bm_inode) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock(&main_bm_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+
+ fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+ if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
+ ocfs2_group_bitmap_size(osb->sb, 0,
+ osb->s_feature_incompat) * 8) {
+ mlog(ML_ERROR, "The disk is too old and small."
+ " Force to do offline resize.");
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh);
+ if (ret < 0) {
+ mlog(ML_ERROR, "Can't read the group descriptor # %llu "
+ "from the device.", (unsigned long long)input->group);
+ goto out_unlock;
+ }
+
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh);
+
+ ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_free_group_bh;
+ }
+
+ trace_ocfs2_group_add((unsigned long long)input->group,
+ input->chain, input->clusters, input->frees);
+
+ handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
+ if (IS_ERR(handle)) {
+ mlog_errno(PTR_ERR(handle));
+ ret = -EINVAL;
+ goto out_free_group_bh;
+ }
+
+ cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
+ cl = &fe->id2.i_chain;
+ cr = &cl->cl_recs[input->chain];
+
+ ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode),
+ group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ group = (struct ocfs2_group_desc *)group_bh->b_data;
+ bg_ptr = le64_to_cpu(group->bg_next_group);
+ group->bg_next_group = cr->c_blkno;
+ ocfs2_journal_dirty(handle, group_bh);
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
+ main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ group->bg_next_group = cpu_to_le64(bg_ptr);
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ if (input->chain == le16_to_cpu(cl->cl_next_free_rec)) {
+ le16_add_cpu(&cl->cl_next_free_rec, 1);
+ memset(cr, 0, sizeof(struct ocfs2_chain_rec));
+ }
+
+ cr->c_blkno = cpu_to_le64(input->group);
+ le32_add_cpu(&cr->c_total, input->clusters * cl_bpc);
+ le32_add_cpu(&cr->c_free, input->frees * cl_bpc);
+
+ le32_add_cpu(&fe->id1.bitmap1.i_total, input->clusters *cl_bpc);
+ le32_add_cpu(&fe->id1.bitmap1.i_used,
+ (input->clusters - input->frees) * cl_bpc);
+ le32_add_cpu(&fe->i_clusters, input->clusters);
+
+ ocfs2_journal_dirty(handle, main_bm_bh);
+
+ spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
+ OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+ le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits);
+ spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
+ i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
+
+ ocfs2_update_super_and_backups(main_bm_inode, input->clusters);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out_free_group_bh:
+ brelse(group_bh);
+
+out_unlock:
+ brelse(main_bm_bh);
+
+ ocfs2_inode_unlock(main_bm_inode, 1);
+
+out_mutex:
+ mutex_unlock(&main_bm_inode->i_mutex);
+ iput(main_bm_inode);
+
+out:
+ return ret;
+}
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/resize.h
index f35eadbed25..f38841abf10 100644
--- a/fs/ocfs2/dlm/dlmfsver.h
+++ b/fs/ocfs2/resize.h
@@ -1,11 +1,11 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
- * dlmver.h
+ * resize.h
*
* Function prototypes
*
- * Copyright (C) 2005 Oracle. All rights reserved.
+ * Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -23,9 +23,10 @@
* Boston, MA 021110-1307, USA.
*/
-#ifndef DLMFS_VER_H
-#define DLMFS_VER_H
+#ifndef OCFS2_RESIZE_H
+#define OCFS2_RESIZE_H
-void dlmfs_print_version(void);
+int ocfs2_group_extend(struct inode * inode, int new_clusters);
+int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input);
-#endif /* DLMFS_VER_H */
+#endif /* OCFS2_RESIZE_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index aa6f5aadedc..1424c151ccc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -26,9 +26,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/smp_lock.h>
-#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -40,154 +38,402 @@
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
-static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
- s16 global);
-static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
- s16 slot_num,
- s16 node_num);
-
-/* Use the slot information we've collected to create a map of mounted
- * nodes. Should be holding an EX on super block. assumes slot info is
- * up to date. Note that we call this *after* we find a slot, so our
- * own node should be set in the map too... */
-void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
+
+struct ocfs2_slot {
+ int sl_valid;
+ unsigned int sl_node_num;
+};
+
+struct ocfs2_slot_info {
+ int si_extended;
+ int si_slots_per_block;
+ struct inode *si_inode;
+ unsigned int si_blocks;
+ struct buffer_head **si_bh;
+ unsigned int si_num_slots;
+ struct ocfs2_slot *si_slots;
+};
+
+
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+ unsigned int node_num);
+
+static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
+ int slot_num)
{
- int i;
- struct ocfs2_slot_info *si = osb->slot_info;
+ BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
+ si->si_slots[slot_num].sl_valid = 0;
+}
- spin_lock(&si->si_lock);
+static void ocfs2_set_slot(struct ocfs2_slot_info *si,
+ int slot_num, unsigned int node_num)
+{
+ BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
- for (i = 0; i < si->si_size; i++)
- if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
- ocfs2_node_map_set_bit(osb, &osb->mounted_map,
- si->si_global_node_nums[i]);
+ si->si_slots[slot_num].sl_valid = 1;
+ si->si_slots[slot_num].sl_node_num = node_num;
+}
- spin_unlock(&si->si_lock);
+/* This version is for the extended slot map */
+static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si)
+{
+ int b, i, slotno;
+ struct ocfs2_slot_map_extended *se;
+
+ slotno = 0;
+ for (b = 0; b < si->si_blocks; b++) {
+ se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data;
+ for (i = 0;
+ (i < si->si_slots_per_block) &&
+ (slotno < si->si_num_slots);
+ i++, slotno++) {
+ if (se->se_slots[i].es_valid)
+ ocfs2_set_slot(si, slotno,
+ le32_to_cpu(se->se_slots[i].es_node_num));
+ else
+ ocfs2_invalidate_slot(si, slotno);
+ }
+ }
}
-/* post the slot information on disk into our slot_info struct. */
-void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+/*
+ * Post the slot information on disk into our slot_info struct.
+ * Must be protected by osb_lock.
+ */
+static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si)
{
int i;
- __le16 *disk_info;
+ struct ocfs2_slot_map *sm;
+
+ sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
- /* we don't read the slot block here as ocfs2_super_lock
- * should've made sure we have the most recent copy. */
- spin_lock(&si->si_lock);
- disk_info = (__le16 *) si->si_bh->b_data;
+ for (i = 0; i < si->si_num_slots; i++) {
+ if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT)
+ ocfs2_invalidate_slot(si, i);
+ else
+ ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]));
+ }
+}
+
+static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+{
+ /*
+ * The slot data will have been refreshed when ocfs2_super_lock
+ * was taken.
+ */
+ if (si->si_extended)
+ ocfs2_update_slot_info_extended(si);
+ else
+ ocfs2_update_slot_info_old(si);
+}
- for (i = 0; i < si->si_size; i++)
- si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
+int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
+{
+ int ret;
+ struct ocfs2_slot_info *si = osb->slot_info;
- spin_unlock(&si->si_lock);
+ if (si == NULL)
+ return 0;
+
+ BUG_ON(si->si_blocks == 0);
+ BUG_ON(si->si_bh == NULL);
+
+ trace_ocfs2_refresh_slot_info(si->si_blocks);
+
+ /*
+ * We pass -1 as blocknr because we expect all of si->si_bh to
+ * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
+ * this is not true, the read of -1 (UINT64_MAX) will fail.
+ */
+ ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks,
+ si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL);
+ if (ret == 0) {
+ spin_lock(&osb->osb_lock);
+ ocfs2_update_slot_info(si);
+ spin_unlock(&osb->osb_lock);
+ }
+
+ return ret;
}
/* post the our slot info stuff into it's destination bh and write it
* out. */
-int ocfs2_update_disk_slots(struct ocfs2_super *osb,
- struct ocfs2_slot_info *si)
+static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si,
+ int slot_num,
+ struct buffer_head **bh)
{
- int status, i;
- __le16 *disk_info = (__le16 *) si->si_bh->b_data;
+ int blkind = slot_num / si->si_slots_per_block;
+ int slotno = slot_num % si->si_slots_per_block;
+ struct ocfs2_slot_map_extended *se;
+
+ BUG_ON(blkind >= si->si_blocks);
+
+ se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data;
+ se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid;
+ if (si->si_slots[slot_num].sl_valid)
+ se->se_slots[slotno].es_node_num =
+ cpu_to_le32(si->si_slots[slot_num].sl_node_num);
+ *bh = si->si_bh[blkind];
+}
- spin_lock(&si->si_lock);
- for (i = 0; i < si->si_size; i++)
- disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
- spin_unlock(&si->si_lock);
+static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si,
+ int slot_num,
+ struct buffer_head **bh)
+{
+ int i;
+ struct ocfs2_slot_map *sm;
+
+ sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
+ for (i = 0; i < si->si_num_slots; i++) {
+ if (si->si_slots[i].sl_valid)
+ sm->sm_slots[i] =
+ cpu_to_le16(si->si_slots[i].sl_node_num);
+ else
+ sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
+ }
+ *bh = si->si_bh[0];
+}
- status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
+static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
+ struct ocfs2_slot_info *si,
+ int slot_num)
+{
+ int status;
+ struct buffer_head *bh;
+
+ spin_lock(&osb->osb_lock);
+ if (si->si_extended)
+ ocfs2_update_disk_slot_extended(si, slot_num, &bh);
+ else
+ ocfs2_update_disk_slot_old(si, slot_num, &bh);
+ spin_unlock(&osb->osb_lock);
+
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode));
if (status < 0)
mlog_errno(status);
return status;
}
-/* try to find global node in the slot info. Returns
- * OCFS2_INVALID_SLOT if nothing is found. */
-static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
- s16 global)
+/*
+ * Calculate how many bytes are needed by the slot map. Returns
+ * an error if the slot map file is too small.
+ */
+static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
+ struct inode *inode,
+ unsigned long long *bytes)
{
- int i;
- s16 ret = OCFS2_INVALID_SLOT;
+ unsigned long long bytes_needed;
+
+ if (ocfs2_uses_extended_slot_map(osb)) {
+ bytes_needed = osb->max_slots *
+ sizeof(struct ocfs2_extended_slot);
+ } else {
+ bytes_needed = osb->max_slots * sizeof(__le16);
+ }
+ if (bytes_needed > i_size_read(inode)) {
+ mlog(ML_ERROR,
+ "Slot map file is too small! (size %llu, needed %llu)\n",
+ i_size_read(inode), bytes_needed);
+ return -ENOSPC;
+ }
+
+ *bytes = bytes_needed;
+ return 0;
+}
+
+/* try to find global node in the slot info. Returns -ENOENT
+ * if nothing is found. */
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+ unsigned int node_num)
+{
+ int i, ret = -ENOENT;
for(i = 0; i < si->si_num_slots; i++) {
- if (global == si->si_global_node_nums[i]) {
- ret = (s16) i;
+ if (si->si_slots[i].sl_valid &&
+ (node_num == si->si_slots[i].sl_node_num)) {
+ ret = i;
break;
}
}
+
return ret;
}
-static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si)
+static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
+ int preferred)
{
- int i;
- s16 ret = OCFS2_INVALID_SLOT;
+ int i, ret = -ENOSPC;
+
+ if ((preferred >= 0) && (preferred < si->si_num_slots)) {
+ if (!si->si_slots[preferred].sl_valid) {
+ ret = preferred;
+ goto out;
+ }
+ }
for(i = 0; i < si->si_num_slots; i++) {
- if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
- ret = (s16) i;
+ if (!si->si_slots[i].sl_valid) {
+ ret = i;
break;
}
}
+out:
return ret;
}
-s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
- s16 global)
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
{
- s16 ret;
+ int slot;
+ struct ocfs2_slot_info *si = osb->slot_info;
- spin_lock(&si->si_lock);
- ret = __ocfs2_node_num_to_slot(si, global);
- spin_unlock(&si->si_lock);
- return ret;
+ spin_lock(&osb->osb_lock);
+ slot = __ocfs2_node_num_to_slot(si, node_num);
+ spin_unlock(&osb->osb_lock);
+
+ return slot;
+}
+
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
+ unsigned int *node_num)
+{
+ struct ocfs2_slot_info *si = osb->slot_info;
+
+ assert_spin_locked(&osb->osb_lock);
+
+ BUG_ON(slot_num < 0);
+ BUG_ON(slot_num > osb->max_slots);
+
+ if (!si->si_slots[slot_num].sl_valid)
+ return -ENOENT;
+
+ *node_num = si->si_slots[slot_num].sl_node_num;
+ return 0;
}
-static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
- s16 slot_num,
- s16 node_num)
+static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
{
- BUG_ON(slot_num == OCFS2_INVALID_SLOT);
- BUG_ON(slot_num >= si->si_num_slots);
- BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
- (node_num >= O2NM_MAX_NODES));
+ unsigned int i;
- si->si_global_node_nums[slot_num] = node_num;
+ if (si == NULL)
+ return;
+
+ if (si->si_inode)
+ iput(si->si_inode);
+ if (si->si_bh) {
+ for (i = 0; i < si->si_blocks; i++) {
+ if (si->si_bh[i]) {
+ brelse(si->si_bh[i]);
+ si->si_bh[i] = NULL;
+ }
+ }
+ kfree(si->si_bh);
+ }
+
+ kfree(si);
}
-void ocfs2_clear_slot(struct ocfs2_slot_info *si,
- s16 slot_num)
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
{
- spin_lock(&si->si_lock);
- __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
- spin_unlock(&si->si_lock);
+ struct ocfs2_slot_info *si = osb->slot_info;
+
+ if (si == NULL)
+ return 0;
+
+ spin_lock(&osb->osb_lock);
+ ocfs2_invalidate_slot(si, slot_num);
+ spin_unlock(&osb->osb_lock);
+
+ return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num);
}
-int ocfs2_init_slot_info(struct ocfs2_super *osb)
+static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
+ struct ocfs2_slot_info *si)
{
- int status, i;
+ int status = 0;
u64 blkno;
+ unsigned long long blocks, bytes = 0;
+ unsigned int i;
+ struct buffer_head *bh;
+
+ status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes);
+ if (status)
+ goto bail;
+
+ blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes);
+ BUG_ON(blocks > UINT_MAX);
+ si->si_blocks = blocks;
+ if (!si->si_blocks)
+ goto bail;
+
+ if (si->si_extended)
+ si->si_slots_per_block =
+ (osb->sb->s_blocksize /
+ sizeof(struct ocfs2_extended_slot));
+ else
+ si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16);
+
+ /* The size checks above should ensure this */
+ BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
+
+ trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
+
+ si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
+ GFP_KERNEL);
+ if (!si->si_bh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ for (i = 0; i < si->si_blocks; i++) {
+ status = ocfs2_extent_map_get_blocks(si->si_inode, i,
+ &blkno, NULL, NULL);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i);
+
+ bh = NULL; /* Acquire a fresh bh */
+ status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
+ 1, &bh, OCFS2_BH_IGNORE_CACHE, NULL);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ si->si_bh[i] = bh;
+ }
+
+bail:
+ return status;
+}
+
+int ocfs2_init_slot_info(struct ocfs2_super *osb)
+{
+ int status;
struct inode *inode = NULL;
- struct buffer_head *bh = NULL;
struct ocfs2_slot_info *si;
- si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL);
+ si = kzalloc(sizeof(struct ocfs2_slot_info) +
+ (sizeof(struct ocfs2_slot) * osb->max_slots),
+ GFP_KERNEL);
if (!si) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
- spin_lock_init(&si->si_lock);
+ si->si_extended = ocfs2_uses_extended_slot_map(osb);
si->si_num_slots = osb->max_slots;
- si->si_size = OCFS2_MAX_SLOTS;
-
- for(i = 0; i < si->si_num_slots; i++)
- si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
+ si->si_slots = (struct ocfs2_slot *)((char *)si +
+ sizeof(struct ocfs2_slot_info));
inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -197,107 +443,96 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
goto bail;
}
- status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
+ si->si_inode = inode;
+ status = ocfs2_map_slot_buffers(osb, si);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- si->si_inode = inode;
- si->si_bh = bh;
- osb->slot_info = si;
+ osb->slot_info = (struct ocfs2_slot_info *)si;
bail:
if (status < 0 && si)
- ocfs2_free_slot_info(si);
+ __ocfs2_free_slot_info(si);
return status;
}
-void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
+void ocfs2_free_slot_info(struct ocfs2_super *osb)
{
- if (si->si_inode)
- iput(si->si_inode);
- if (si->si_bh)
- brelse(si->si_bh);
- kfree(si);
+ struct ocfs2_slot_info *si = osb->slot_info;
+
+ osb->slot_info = NULL;
+ __ocfs2_free_slot_info(si);
}
int ocfs2_find_slot(struct ocfs2_super *osb)
{
int status;
- s16 slot;
+ int slot;
struct ocfs2_slot_info *si;
- mlog_entry_void();
-
si = osb->slot_info;
+ spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
- spin_lock(&si->si_lock);
/* search for ourselves first and take the slot if it already
* exists. Perhaps we need to mark this in a variable for our
* own journal recovery? Possibly not, though we certainly
* need to warn to the user */
slot = __ocfs2_node_num_to_slot(si, osb->node_num);
- if (slot == OCFS2_INVALID_SLOT) {
+ if (slot < 0) {
/* if no slot yet, then just take 1st available
* one. */
- slot = __ocfs2_find_empty_slot(si);
- if (slot == OCFS2_INVALID_SLOT) {
- spin_unlock(&si->si_lock);
+ slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
+ if (slot < 0) {
+ spin_unlock(&osb->osb_lock);
mlog(ML_ERROR, "no free slots available!\n");
status = -EINVAL;
goto bail;
}
} else
- mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
- slot);
+ printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
+ "allocated to this node!\n", slot, osb->dev_str);
- __ocfs2_fill_slot(si, slot, osb->node_num);
+ ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
- spin_unlock(&si->si_lock);
+ spin_unlock(&osb->osb_lock);
- mlog(0, "taking node slot %d\n", osb->slot_num);
+ trace_ocfs2_find_slot(osb->slot_num);
- status = ocfs2_update_disk_slots(osb, si);
+ status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
if (status < 0)
mlog_errno(status);
bail:
- mlog_exit(status);
return status;
}
void ocfs2_put_slot(struct ocfs2_super *osb)
{
- int status;
+ int status, slot_num;
struct ocfs2_slot_info *si = osb->slot_info;
if (!si)
return;
+ spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
- spin_lock(&si->si_lock);
- __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
+ slot_num = osb->slot_num;
+ ocfs2_invalidate_slot(si, osb->slot_num);
osb->slot_num = OCFS2_INVALID_SLOT;
- spin_unlock(&si->si_lock);
+ spin_unlock(&osb->osb_lock);
- status = ocfs2_update_disk_slots(osb, si);
+ status = ocfs2_update_disk_slot(osb, si, slot_num);
if (status < 0) {
mlog_errno(status);
goto bail;
}
bail:
- osb->slot_info = NULL;
- ocfs2_free_slot_info(si);
+ ocfs2_free_slot_info(osb);
}
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index d8c8ceed031..601c95fd700 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -27,40 +27,18 @@
#ifndef SLOTMAP_H
#define SLOTMAP_H
-struct ocfs2_slot_info {
- spinlock_t si_lock;
-
- struct inode *si_inode;
- struct buffer_head *si_bh;
- unsigned int si_num_slots;
- unsigned int si_size;
- s16 si_global_node_nums[OCFS2_MAX_SLOTS];
-};
-
int ocfs2_init_slot_info(struct ocfs2_super *osb);
-void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
+void ocfs2_free_slot_info(struct ocfs2_super *osb);
int ocfs2_find_slot(struct ocfs2_super *osb);
void ocfs2_put_slot(struct ocfs2_super *osb);
-void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
-int ocfs2_update_disk_slots(struct ocfs2_super *osb,
- struct ocfs2_slot_info *si);
-
-s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
- s16 global);
-void ocfs2_clear_slot(struct ocfs2_slot_info *si,
- s16 slot_num);
-
-void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
+int ocfs2_refresh_slot_info(struct ocfs2_super *osb);
-static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
- int slot_num)
-{
- BUG_ON(slot_num == OCFS2_INVALID_SLOT);
- assert_spin_locked(&si->si_lock);
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num);
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
+ unsigned int *node_num);
- return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
-}
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num);
#endif
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
new file mode 100644
index 00000000000..1724d43d3da
--- /dev/null
+++ b/fs/ocfs2/stack_o2cb.c
@@ -0,0 +1,449 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stack_o2cb.c
+ *
+ * Code which interfaces ocfs2 with the o2cb stack.
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/crc32.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
+#include <linux/fs.h>
+
+#include "cluster/masklog.h"
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "stackglue.h"
+
+struct o2dlm_private {
+ struct dlm_eviction_cb op_eviction_cb;
+};
+
+static struct ocfs2_stack_plugin o2cb_stack;
+
+/* These should be identical */
+#if (DLM_LOCK_IV != LKM_IVMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_NL != LKM_NLMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CR != LKM_CRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CW != LKM_CWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PR != LKM_PRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PW != LKM_PWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_EX != LKM_EXMODE)
+# error Lock modes do not match
+#endif
+static inline int mode_to_o2dlm(int mode)
+{
+ BUG_ON(mode > LKM_MAXMODE);
+
+ return mode;
+}
+
+#define map_flag(_generic, _o2dlm) \
+ if (flags & (_generic)) { \
+ flags &= ~(_generic); \
+ o2dlm_flags |= (_o2dlm); \
+ }
+static int flags_to_o2dlm(u32 flags)
+{
+ int o2dlm_flags = 0;
+
+ map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE);
+ map_flag(DLM_LKF_CANCEL, LKM_CANCEL);
+ map_flag(DLM_LKF_CONVERT, LKM_CONVERT);
+ map_flag(DLM_LKF_VALBLK, LKM_VALBLK);
+ map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK);
+ map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN);
+ map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE);
+ map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT);
+ map_flag(DLM_LKF_LOCAL, LKM_LOCAL);
+
+ /* map_flag() should have cleared every flag passed in */
+ BUG_ON(flags != 0);
+
+ return o2dlm_flags;
+}
+#undef map_flag
+
+/*
+ * Map an o2dlm status to standard errno values.
+ *
+ * o2dlm only uses a handful of these, and returns even fewer to the
+ * caller. Still, we try to assign sane values to each error.
+ *
+ * The following value pairs have special meanings to dlmglue, thus
+ * the right hand side needs to stay unique - never duplicate the
+ * mapping elsewhere in the table!
+ *
+ * DLM_NORMAL: 0
+ * DLM_NOTQUEUED: -EAGAIN
+ * DLM_CANCELGRANT: -EBUSY
+ * DLM_CANCEL: -DLM_ECANCEL
+ */
+/* Keep in sync with dlmapi.h */
+static int status_map[] = {
+ [DLM_NORMAL] = 0, /* Success */
+ [DLM_GRANTED] = -EINVAL,
+ [DLM_DENIED] = -EACCES,
+ [DLM_DENIED_NOLOCKS] = -EACCES,
+ [DLM_WORKING] = -EACCES,
+ [DLM_BLOCKED] = -EINVAL,
+ [DLM_BLOCKED_ORPHAN] = -EINVAL,
+ [DLM_DENIED_GRACE_PERIOD] = -EACCES,
+ [DLM_SYSERR] = -ENOMEM, /* It is what it is */
+ [DLM_NOSUPPORT] = -EPROTO,
+ [DLM_CANCELGRANT] = -EBUSY, /* Cancel after grant */
+ [DLM_IVLOCKID] = -EINVAL,
+ [DLM_SYNC] = -EINVAL,
+ [DLM_BADTYPE] = -EINVAL,
+ [DLM_BADRESOURCE] = -EINVAL,
+ [DLM_MAXHANDLES] = -ENOMEM,
+ [DLM_NOCLINFO] = -EINVAL,
+ [DLM_NOLOCKMGR] = -EINVAL,
+ [DLM_NOPURGED] = -EINVAL,
+ [DLM_BADARGS] = -EINVAL,
+ [DLM_VOID] = -EINVAL,
+ [DLM_NOTQUEUED] = -EAGAIN, /* Trylock failed */
+ [DLM_IVBUFLEN] = -EINVAL,
+ [DLM_CVTUNGRANT] = -EPERM,
+ [DLM_BADPARAM] = -EINVAL,
+ [DLM_VALNOTVALID] = -EINVAL,
+ [DLM_REJECTED] = -EPERM,
+ [DLM_ABORT] = -EINVAL,
+ [DLM_CANCEL] = -DLM_ECANCEL, /* Successful cancel */
+ [DLM_IVRESHANDLE] = -EINVAL,
+ [DLM_DEADLOCK] = -EDEADLK,
+ [DLM_DENIED_NOASTS] = -EINVAL,
+ [DLM_FORWARD] = -EINVAL,
+ [DLM_TIMEOUT] = -ETIMEDOUT,
+ [DLM_IVGROUPID] = -EINVAL,
+ [DLM_VERS_CONFLICT] = -EOPNOTSUPP,
+ [DLM_BAD_DEVICE_PATH] = -ENOENT,
+ [DLM_NO_DEVICE_PERMISSION] = -EPERM,
+ [DLM_NO_CONTROL_DEVICE] = -ENOENT,
+ [DLM_RECOVERING] = -ENOTCONN,
+ [DLM_MIGRATING] = -ERESTART,
+ [DLM_MAXSTATS] = -EINVAL,
+};
+
+static int dlm_status_to_errno(enum dlm_status status)
+{
+ BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map));
+
+ return status_map[status];
+}
+
+static void o2dlm_lock_ast_wrapper(void *astarg)
+{
+ struct ocfs2_dlm_lksb *lksb = astarg;
+
+ lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
+}
+
+static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
+{
+ struct ocfs2_dlm_lksb *lksb = astarg;
+
+ lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
+}
+
+static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
+{
+ struct ocfs2_dlm_lksb *lksb = astarg;
+ int error = dlm_status_to_errno(status);
+
+ /*
+ * In o2dlm, you can get both the lock_ast() for the lock being
+ * granted and the unlock_ast() for the CANCEL failing. A
+ * successful cancel sends DLM_NORMAL here. If the
+ * lock grant happened before the cancel arrived, you get
+ * DLM_CANCELGRANT.
+ *
+ * There's no need for the double-ast. If we see DLM_CANCELGRANT,
+ * we just ignore it. We expect the lock_ast() to handle the
+ * granted lock.
+ */
+ if (status == DLM_CANCELGRANT)
+ return;
+
+ lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error);
+}
+
+static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
+ int mode,
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ void *name,
+ unsigned int namelen)
+{
+ enum dlm_status status;
+ int o2dlm_mode = mode_to_o2dlm(mode);
+ int o2dlm_flags = flags_to_o2dlm(flags);
+ int ret;
+
+ status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
+ o2dlm_flags, name, namelen,
+ o2dlm_lock_ast_wrapper, lksb,
+ o2dlm_blocking_ast_wrapper);
+ ret = dlm_status_to_errno(status);
+ return ret;
+}
+
+static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags)
+{
+ enum dlm_status status;
+ int o2dlm_flags = flags_to_o2dlm(flags);
+ int ret;
+
+ status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
+ o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb);
+ ret = dlm_status_to_errno(status);
+ return ret;
+}
+
+static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
+{
+ return dlm_status_to_errno(lksb->lksb_o2dlm.status);
+}
+
+/*
+ * o2dlm aways has a "valid" LVB. If the dlm loses track of the LVB
+ * contents, it will zero out the LVB. Thus the caller can always trust
+ * the contents.
+ */
+static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
+{
+ return 1;
+}
+
+static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
+{
+ return (void *)(lksb->lksb_o2dlm.lvb);
+}
+
+static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
+{
+ dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
+}
+
+/*
+ * Check if this node is heartbeating and is connected to all other
+ * heartbeating nodes.
+ */
+static int o2cb_cluster_check(void)
+{
+ u8 node_num;
+ int i;
+ unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+ node_num = o2nm_this_node();
+ if (node_num == O2NM_MAX_NODES) {
+ printk(KERN_ERR "o2cb: This node has not been configured.\n");
+ return -EINVAL;
+ }
+
+ /*
+ * o2dlm expects o2net sockets to be created. If not, then
+ * dlm_join_domain() fails with a stack of errors which are both cryptic
+ * and incomplete. The idea here is to detect upfront whether we have
+ * managed to connect to all nodes or not. If not, then list the nodes
+ * to allow the user to check the configuration (incorrect IP, firewall,
+ * etc.) Yes, this is racy. But its not the end of the world.
+ */
+#define O2CB_MAP_STABILIZE_COUNT 60
+ for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
+ o2hb_fill_node_map(hbmap, sizeof(hbmap));
+ if (!test_bit(node_num, hbmap)) {
+ printk(KERN_ERR "o2cb: %s heartbeat has not been "
+ "started.\n", (o2hb_global_heartbeat_active() ?
+ "Global" : "Local"));
+ return -EINVAL;
+ }
+ o2net_fill_node_map(netmap, sizeof(netmap));
+ /* Force set the current node to allow easy compare */
+ set_bit(node_num, netmap);
+ if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+ return 0;
+ if (i < O2CB_MAP_STABILIZE_COUNT)
+ msleep(1000);
+ }
+
+ printk(KERN_ERR "o2cb: This node could not connect to nodes:");
+ i = -1;
+ while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
+ i + 1)) < O2NM_MAX_NODES) {
+ if (!test_bit(i, netmap))
+ printk(" %u", i);
+ }
+ printk(".\n");
+
+ return -ENOTCONN;
+}
+
+/*
+ * Called from the dlm when it's about to evict a node. This is how the
+ * classic stack signals node death.
+ */
+static void o2dlm_eviction_cb(int node_num, void *data)
+{
+ struct ocfs2_cluster_connection *conn = data;
+
+ printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n",
+ node_num, conn->cc_namelen, conn->cc_name);
+
+ conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
+}
+
+static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
+{
+ int rc = 0;
+ u32 dlm_key;
+ struct dlm_ctxt *dlm;
+ struct o2dlm_private *priv;
+ struct dlm_protocol_version fs_version;
+
+ BUG_ON(conn == NULL);
+ BUG_ON(conn->cc_proto == NULL);
+
+ /* Ensure cluster stack is up and all nodes are connected */
+ rc = o2cb_cluster_check();
+ if (rc) {
+ printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
+ "before retrying.\n");
+ goto out;
+ }
+
+ priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
+ if (!priv) {
+ rc = -ENOMEM;
+ goto out_free;
+ }
+
+ /* This just fills the structure in. It is safe to pass conn. */
+ dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
+ conn);
+
+ conn->cc_private = priv;
+
+ /* used by the dlm code to make message headers unique, each
+ * node in this domain must agree on this. */
+ dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
+ fs_version.pv_major = conn->cc_version.pv_major;
+ fs_version.pv_minor = conn->cc_version.pv_minor;
+
+ dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version);
+ if (IS_ERR(dlm)) {
+ rc = PTR_ERR(dlm);
+ mlog_errno(rc);
+ goto out_free;
+ }
+
+ conn->cc_version.pv_major = fs_version.pv_major;
+ conn->cc_version.pv_minor = fs_version.pv_minor;
+ conn->cc_lockspace = dlm;
+
+ dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
+
+out_free:
+ if (rc)
+ kfree(conn->cc_private);
+
+out:
+ return rc;
+}
+
+static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+ struct dlm_ctxt *dlm = conn->cc_lockspace;
+ struct o2dlm_private *priv = conn->cc_private;
+
+ dlm_unregister_eviction_cb(&priv->op_eviction_cb);
+ conn->cc_private = NULL;
+ kfree(priv);
+
+ dlm_unregister_domain(dlm);
+ conn->cc_lockspace = NULL;
+
+ return 0;
+}
+
+static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node)
+{
+ int node_num;
+
+ node_num = o2nm_this_node();
+ if (node_num == O2NM_INVALID_NODE_NUM)
+ return -ENOENT;
+
+ if (node_num >= O2NM_MAX_NODES)
+ return -EOVERFLOW;
+
+ *node = node_num;
+ return 0;
+}
+
+static struct ocfs2_stack_operations o2cb_stack_ops = {
+ .connect = o2cb_cluster_connect,
+ .disconnect = o2cb_cluster_disconnect,
+ .this_node = o2cb_cluster_this_node,
+ .dlm_lock = o2cb_dlm_lock,
+ .dlm_unlock = o2cb_dlm_unlock,
+ .lock_status = o2cb_dlm_lock_status,
+ .lvb_valid = o2cb_dlm_lvb_valid,
+ .lock_lvb = o2cb_dlm_lvb,
+ .dump_lksb = o2cb_dump_lksb,
+};
+
+static struct ocfs2_stack_plugin o2cb_stack = {
+ .sp_name = "o2cb",
+ .sp_ops = &o2cb_stack_ops,
+ .sp_owner = THIS_MODULE,
+};
+
+static int __init o2cb_stack_init(void)
+{
+ return ocfs2_stack_glue_register(&o2cb_stack);
+}
+
+static void __exit o2cb_stack_exit(void)
+{
+ ocfs2_stack_glue_unregister(&o2cb_stack);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack");
+MODULE_LICENSE("GPL");
+module_init(o2cb_stack_init);
+module_exit(o2cb_stack_exit);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
new file mode 100644
index 00000000000..13a8537d8e8
--- /dev/null
+++ b/fs/ocfs2/stack_user.c
@@ -0,0 +1,1136 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stack_user.c
+ *
+ * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+
+#include "stackglue.h"
+
+#include <linux/dlm_plock.h>
+
+/*
+ * The control protocol starts with a handshake. Until the handshake
+ * is complete, the control device will fail all write(2)s.
+ *
+ * The handshake is simple. First, the client reads until EOF. Each line
+ * of output is a supported protocol tag. All protocol tags are a single
+ * character followed by a two hex digit version number. Currently the
+ * only things supported is T01, for "Text-base version 0x01". Next, the
+ * client writes the version they would like to use, including the newline.
+ * Thus, the protocol tag is 'T01\n'. If the version tag written is
+ * unknown, -EINVAL is returned. Once the negotiation is complete, the
+ * client can start sending messages.
+ *
+ * The T01 protocol has three messages. First is the "SETN" message.
+ * It has the following syntax:
+ *
+ * SETN<space><8-char-hex-nodenum><newline>
+ *
+ * This is 14 characters.
+ *
+ * The "SETN" message must be the first message following the protocol.
+ * It tells ocfs2_control the local node number.
+ *
+ * Next comes the "SETV" message. It has the following syntax:
+ *
+ * SETV<space><2-char-hex-major><space><2-char-hex-minor><newline>
+ *
+ * This is 11 characters.
+ *
+ * The "SETV" message sets the filesystem locking protocol version as
+ * negotiated by the client. The client negotiates based on the maximum
+ * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
+ * number from the "SETV" message must match
+ * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number
+ * must be less than or equal to ...sp_max_version.pv_minor.
+ *
+ * Once this information has been set, mounts will be allowed. From this
+ * point on, the "DOWN" message can be sent for node down notification.
+ * It has the following syntax:
+ *
+ * DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
+ *
+ * eg:
+ *
+ * DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n
+ *
+ * This is 47 characters.
+ */
+
+/*
+ * Whether or not the client has done the handshake.
+ * For now, we have just one protocol version.
+ */
+#define OCFS2_CONTROL_PROTO "T01\n"
+#define OCFS2_CONTROL_PROTO_LEN 4
+
+/* Handshake states */
+#define OCFS2_CONTROL_HANDSHAKE_INVALID (0)
+#define OCFS2_CONTROL_HANDSHAKE_READ (1)
+#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL (2)
+#define OCFS2_CONTROL_HANDSHAKE_VALID (3)
+
+/* Messages */
+#define OCFS2_CONTROL_MESSAGE_OP_LEN 4
+#define OCFS2_CONTROL_MESSAGE_SETNODE_OP "SETN"
+#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN 14
+#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP "SETV"
+#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN 11
+#define OCFS2_CONTROL_MESSAGE_DOWN_OP "DOWN"
+#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN 47
+#define OCFS2_TEXT_UUID_LEN 32
+#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2
+#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8
+#define VERSION_LOCK "version_lock"
+
+enum ocfs2_connection_type {
+ WITH_CONTROLD,
+ NO_CONTROLD
+};
+
+/*
+ * ocfs2_live_connection is refcounted because the filesystem and
+ * miscdevice sides can detach in different order. Let's just be safe.
+ */
+struct ocfs2_live_connection {
+ struct list_head oc_list;
+ struct ocfs2_cluster_connection *oc_conn;
+ enum ocfs2_connection_type oc_type;
+ atomic_t oc_this_node;
+ int oc_our_slot;
+ struct dlm_lksb oc_version_lksb;
+ char oc_lvb[DLM_LVB_LEN];
+ struct completion oc_sync_wait;
+ wait_queue_head_t oc_wait;
+};
+
+struct ocfs2_control_private {
+ struct list_head op_list;
+ int op_state;
+ int op_this_node;
+ struct ocfs2_protocol_version op_proto;
+};
+
+/* SETN<space><8-char-hex-nodenum><newline> */
+struct ocfs2_control_message_setn {
+ char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+ char space;
+ char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
+ char newline;
+};
+
+/* SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> */
+struct ocfs2_control_message_setv {
+ char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+ char space1;
+ char major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
+ char space2;
+ char minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
+ char newline;
+};
+
+/* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */
+struct ocfs2_control_message_down {
+ char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+ char space1;
+ char uuid[OCFS2_TEXT_UUID_LEN];
+ char space2;
+ char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
+ char newline;
+};
+
+union ocfs2_control_message {
+ char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+ struct ocfs2_control_message_setn u_setn;
+ struct ocfs2_control_message_setv u_setv;
+ struct ocfs2_control_message_down u_down;
+};
+
+static struct ocfs2_stack_plugin ocfs2_user_plugin;
+
+static atomic_t ocfs2_control_opened;
+static int ocfs2_control_this_node = -1;
+static struct ocfs2_protocol_version running_proto;
+
+static LIST_HEAD(ocfs2_live_connection_list);
+static LIST_HEAD(ocfs2_control_private_list);
+static DEFINE_MUTEX(ocfs2_control_lock);
+
+static inline void ocfs2_control_set_handshake_state(struct file *file,
+ int state)
+{
+ struct ocfs2_control_private *p = file->private_data;
+ p->op_state = state;
+}
+
+static inline int ocfs2_control_get_handshake_state(struct file *file)
+{
+ struct ocfs2_control_private *p = file->private_data;
+ return p->op_state;
+}
+
+static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
+{
+ size_t len = strlen(name);
+ struct ocfs2_live_connection *c;
+
+ BUG_ON(!mutex_is_locked(&ocfs2_control_lock));
+
+ list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) {
+ if ((c->oc_conn->cc_namelen == len) &&
+ !strncmp(c->oc_conn->cc_name, name, len))
+ return c;
+ }
+
+ return NULL;
+}
+
+/*
+ * ocfs2_live_connection structures are created underneath the ocfs2
+ * mount path. Since the VFS prevents multiple calls to
+ * fill_super(), we can't get dupes here.
+ */
+static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
+ struct ocfs2_live_connection *c)
+{
+ int rc = 0;
+
+ mutex_lock(&ocfs2_control_lock);
+ c->oc_conn = conn;
+
+ if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))
+ list_add(&c->oc_list, &ocfs2_live_connection_list);
+ else {
+ printk(KERN_ERR
+ "ocfs2: Userspace control daemon is not present\n");
+ rc = -ESRCH;
+ }
+
+ mutex_unlock(&ocfs2_control_lock);
+ return rc;
+}
+
+/*
+ * This function disconnects the cluster connection from ocfs2_control.
+ * Afterwards, userspace can't affect the cluster connection.
+ */
+static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c)
+{
+ mutex_lock(&ocfs2_control_lock);
+ list_del_init(&c->oc_list);
+ c->oc_conn = NULL;
+ mutex_unlock(&ocfs2_control_lock);
+
+ kfree(c);
+}
+
+static int ocfs2_control_cfu(void *target, size_t target_len,
+ const char __user *buf, size_t count)
+{
+ /* The T01 expects write(2) calls to have exactly one command */
+ if ((count != target_len) ||
+ (count > sizeof(union ocfs2_control_message)))
+ return -EINVAL;
+
+ if (copy_from_user(target, buf, target_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static ssize_t ocfs2_control_validate_protocol(struct file *file,
+ const char __user *buf,
+ size_t count)
+{
+ ssize_t ret;
+ char kbuf[OCFS2_CONTROL_PROTO_LEN];
+
+ ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN,
+ buf, count);
+ if (ret)
+ return ret;
+
+ if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN))
+ return -EINVAL;
+
+ ocfs2_control_set_handshake_state(file,
+ OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
+
+ return count;
+}
+
+static void ocfs2_control_send_down(const char *uuid,
+ int nodenum)
+{
+ struct ocfs2_live_connection *c;
+
+ mutex_lock(&ocfs2_control_lock);
+
+ c = ocfs2_connection_find(uuid);
+ if (c) {
+ BUG_ON(c->oc_conn == NULL);
+ c->oc_conn->cc_recovery_handler(nodenum,
+ c->oc_conn->cc_recovery_data);
+ }
+
+ mutex_unlock(&ocfs2_control_lock);
+}
+
+/*
+ * Called whenever configuration elements are sent to /dev/ocfs2_control.
+ * If all configuration elements are present, try to set the global
+ * values. If there is a problem, return an error. Skip any missing
+ * elements, and only bump ocfs2_control_opened when we have all elements
+ * and are successful.
+ */
+static int ocfs2_control_install_private(struct file *file)
+{
+ int rc = 0;
+ int set_p = 1;
+ struct ocfs2_control_private *p = file->private_data;
+
+ BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
+
+ mutex_lock(&ocfs2_control_lock);
+
+ if (p->op_this_node < 0) {
+ set_p = 0;
+ } else if ((ocfs2_control_this_node >= 0) &&
+ (ocfs2_control_this_node != p->op_this_node)) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (!p->op_proto.pv_major) {
+ set_p = 0;
+ } else if (!list_empty(&ocfs2_live_connection_list) &&
+ ((running_proto.pv_major != p->op_proto.pv_major) ||
+ (running_proto.pv_minor != p->op_proto.pv_minor))) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (set_p) {
+ ocfs2_control_this_node = p->op_this_node;
+ running_proto.pv_major = p->op_proto.pv_major;
+ running_proto.pv_minor = p->op_proto.pv_minor;
+ }
+
+out_unlock:
+ mutex_unlock(&ocfs2_control_lock);
+
+ if (!rc && set_p) {
+ /* We set the global values successfully */
+ atomic_inc(&ocfs2_control_opened);
+ ocfs2_control_set_handshake_state(file,
+ OCFS2_CONTROL_HANDSHAKE_VALID);
+ }
+
+ return rc;
+}
+
+static int ocfs2_control_get_this_node(void)
+{
+ int rc;
+
+ mutex_lock(&ocfs2_control_lock);
+ if (ocfs2_control_this_node < 0)
+ rc = -EINVAL;
+ else
+ rc = ocfs2_control_this_node;
+ mutex_unlock(&ocfs2_control_lock);
+
+ return rc;
+}
+
+static int ocfs2_control_do_setnode_msg(struct file *file,
+ struct ocfs2_control_message_setn *msg)
+{
+ long nodenum;
+ char *ptr = NULL;
+ struct ocfs2_control_private *p = file->private_data;
+
+ if (ocfs2_control_get_handshake_state(file) !=
+ OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
+ return -EINVAL;
+
+ if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
+ OCFS2_CONTROL_MESSAGE_OP_LEN))
+ return -EINVAL;
+
+ if ((msg->space != ' ') || (msg->newline != '\n'))
+ return -EINVAL;
+ msg->space = msg->newline = '\0';
+
+ nodenum = simple_strtol(msg->nodestr, &ptr, 16);
+ if (!ptr || *ptr)
+ return -EINVAL;
+
+ if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
+ (nodenum > INT_MAX) || (nodenum < 0))
+ return -ERANGE;
+ p->op_this_node = nodenum;
+
+ return ocfs2_control_install_private(file);
+}
+
+static int ocfs2_control_do_setversion_msg(struct file *file,
+ struct ocfs2_control_message_setv *msg)
+ {
+ long major, minor;
+ char *ptr = NULL;
+ struct ocfs2_control_private *p = file->private_data;
+ struct ocfs2_protocol_version *max =
+ &ocfs2_user_plugin.sp_max_proto;
+
+ if (ocfs2_control_get_handshake_state(file) !=
+ OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
+ return -EINVAL;
+
+ if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
+ OCFS2_CONTROL_MESSAGE_OP_LEN))
+ return -EINVAL;
+
+ if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
+ (msg->newline != '\n'))
+ return -EINVAL;
+ msg->space1 = msg->space2 = msg->newline = '\0';
+
+ major = simple_strtol(msg->major, &ptr, 16);
+ if (!ptr || *ptr)
+ return -EINVAL;
+ minor = simple_strtol(msg->minor, &ptr, 16);
+ if (!ptr || *ptr)
+ return -EINVAL;
+
+ /*
+ * The major must be between 1 and 255, inclusive. The minor
+ * must be between 0 and 255, inclusive. The version passed in
+ * must be within the maximum version supported by the filesystem.
+ */
+ if ((major == LONG_MIN) || (major == LONG_MAX) ||
+ (major > (u8)-1) || (major < 1))
+ return -ERANGE;
+ if ((minor == LONG_MIN) || (minor == LONG_MAX) ||
+ (minor > (u8)-1) || (minor < 0))
+ return -ERANGE;
+ if ((major != max->pv_major) ||
+ (minor > max->pv_minor))
+ return -EINVAL;
+
+ p->op_proto.pv_major = major;
+ p->op_proto.pv_minor = minor;
+
+ return ocfs2_control_install_private(file);
+}
+
+static int ocfs2_control_do_down_msg(struct file *file,
+ struct ocfs2_control_message_down *msg)
+{
+ long nodenum;
+ char *p = NULL;
+
+ if (ocfs2_control_get_handshake_state(file) !=
+ OCFS2_CONTROL_HANDSHAKE_VALID)
+ return -EINVAL;
+
+ if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
+ OCFS2_CONTROL_MESSAGE_OP_LEN))
+ return -EINVAL;
+
+ if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
+ (msg->newline != '\n'))
+ return -EINVAL;
+ msg->space1 = msg->space2 = msg->newline = '\0';
+
+ nodenum = simple_strtol(msg->nodestr, &p, 16);
+ if (!p || *p)
+ return -EINVAL;
+
+ if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
+ (nodenum > INT_MAX) || (nodenum < 0))
+ return -ERANGE;
+
+ ocfs2_control_send_down(msg->uuid, nodenum);
+
+ return 0;
+}
+
+static ssize_t ocfs2_control_message(struct file *file,
+ const char __user *buf,
+ size_t count)
+{
+ ssize_t ret;
+ union ocfs2_control_message msg;
+
+ /* Try to catch padding issues */
+ WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) !=
+ (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1)));
+
+ memset(&msg, 0, sizeof(union ocfs2_control_message));
+ ret = ocfs2_control_cfu(&msg, count, buf, count);
+ if (ret)
+ goto out;
+
+ if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) &&
+ !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
+ OCFS2_CONTROL_MESSAGE_OP_LEN))
+ ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn);
+ else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) &&
+ !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
+ OCFS2_CONTROL_MESSAGE_OP_LEN))
+ ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv);
+ else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) &&
+ !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
+ OCFS2_CONTROL_MESSAGE_OP_LEN))
+ ret = ocfs2_control_do_down_msg(file, &msg.u_down);
+ else
+ ret = -EINVAL;
+
+out:
+ return ret ? ret : count;
+}
+
+static ssize_t ocfs2_control_write(struct file *file,
+ const char __user *buf,
+ size_t count,
+ loff_t *ppos)
+{
+ ssize_t ret;
+
+ switch (ocfs2_control_get_handshake_state(file)) {
+ case OCFS2_CONTROL_HANDSHAKE_INVALID:
+ ret = -EINVAL;
+ break;
+
+ case OCFS2_CONTROL_HANDSHAKE_READ:
+ ret = ocfs2_control_validate_protocol(file, buf,
+ count);
+ break;
+
+ case OCFS2_CONTROL_HANDSHAKE_PROTOCOL:
+ case OCFS2_CONTROL_HANDSHAKE_VALID:
+ ret = ocfs2_control_message(file, buf, count);
+ break;
+
+ default:
+ BUG();
+ ret = -EIO;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * This is a naive version. If we ever have a new protocol, we'll expand
+ * it. Probably using seq_file.
+ */
+static ssize_t ocfs2_control_read(struct file *file,
+ char __user *buf,
+ size_t count,
+ loff_t *ppos)
+{
+ ssize_t ret;
+
+ ret = simple_read_from_buffer(buf, count, ppos,
+ OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN);
+
+ /* Have we read the whole protocol list? */
+ if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN)
+ ocfs2_control_set_handshake_state(file,
+ OCFS2_CONTROL_HANDSHAKE_READ);
+
+ return ret;
+}
+
+static int ocfs2_control_release(struct inode *inode, struct file *file)
+{
+ struct ocfs2_control_private *p = file->private_data;
+
+ mutex_lock(&ocfs2_control_lock);
+
+ if (ocfs2_control_get_handshake_state(file) !=
+ OCFS2_CONTROL_HANDSHAKE_VALID)
+ goto out;
+
+ if (atomic_dec_and_test(&ocfs2_control_opened)) {
+ if (!list_empty(&ocfs2_live_connection_list)) {
+ /* XXX: Do bad things! */
+ printk(KERN_ERR
+ "ocfs2: Unexpected release of ocfs2_control!\n"
+ " Loss of cluster connection requires "
+ "an emergency restart!\n");
+ emergency_restart();
+ }
+ /*
+ * Last valid close clears the node number and resets
+ * the locking protocol version
+ */
+ ocfs2_control_this_node = -1;
+ running_proto.pv_major = 0;
+ running_proto.pv_major = 0;
+ }
+
+out:
+ list_del_init(&p->op_list);
+ file->private_data = NULL;
+
+ mutex_unlock(&ocfs2_control_lock);
+
+ kfree(p);
+
+ return 0;
+}
+
+static int ocfs2_control_open(struct inode *inode, struct file *file)
+{
+ struct ocfs2_control_private *p;
+
+ p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ p->op_this_node = -1;
+
+ mutex_lock(&ocfs2_control_lock);
+ file->private_data = p;
+ list_add(&p->op_list, &ocfs2_control_private_list);
+ mutex_unlock(&ocfs2_control_lock);
+
+ return 0;
+}
+
+static const struct file_operations ocfs2_control_fops = {
+ .open = ocfs2_control_open,
+ .release = ocfs2_control_release,
+ .read = ocfs2_control_read,
+ .write = ocfs2_control_write,
+ .owner = THIS_MODULE,
+ .llseek = default_llseek,
+};
+
+static struct miscdevice ocfs2_control_device = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "ocfs2_control",
+ .fops = &ocfs2_control_fops,
+};
+
+static int ocfs2_control_init(void)
+{
+ int rc;
+
+ atomic_set(&ocfs2_control_opened, 0);
+
+ rc = misc_register(&ocfs2_control_device);
+ if (rc)
+ printk(KERN_ERR
+ "ocfs2: Unable to register ocfs2_control device "
+ "(errno %d)\n",
+ -rc);
+
+ return rc;
+}
+
+static void ocfs2_control_exit(void)
+{
+ int rc;
+
+ rc = misc_deregister(&ocfs2_control_device);
+ if (rc)
+ printk(KERN_ERR
+ "ocfs2: Unable to deregister ocfs2_control device "
+ "(errno %d)\n",
+ -rc);
+}
+
+static void fsdlm_lock_ast_wrapper(void *astarg)
+{
+ struct ocfs2_dlm_lksb *lksb = astarg;
+ int status = lksb->lksb_fsdlm.sb_status;
+
+ /*
+ * For now we're punting on the issue of other non-standard errors
+ * where we can't tell if the unlock_ast or lock_ast should be called.
+ * The main "other error" that's possible is EINVAL which means the
+ * function was called with invalid args, which shouldn't be possible
+ * since the caller here is under our control. Other non-standard
+ * errors probably fall into the same category, or otherwise are fatal
+ * which means we can't carry on anyway.
+ */
+
+ if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
+ lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0);
+ else
+ lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
+}
+
+static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
+{
+ struct ocfs2_dlm_lksb *lksb = astarg;
+
+ lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
+}
+
+static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
+ int mode,
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ void *name,
+ unsigned int namelen)
+{
+ int ret;
+
+ if (!lksb->lksb_fsdlm.sb_lvbptr)
+ lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
+ sizeof(struct dlm_lksb);
+
+ ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
+ flags|DLM_LKF_NODLCKWT, name, namelen, 0,
+ fsdlm_lock_ast_wrapper, lksb,
+ fsdlm_blocking_ast_wrapper);
+ return ret;
+}
+
+static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags)
+{
+ int ret;
+
+ ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
+ flags, &lksb->lksb_fsdlm, lksb);
+ return ret;
+}
+
+static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
+{
+ return lksb->lksb_fsdlm.sb_status;
+}
+
+static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
+{
+ int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
+
+ return !invalid;
+}
+
+static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
+{
+ if (!lksb->lksb_fsdlm.sb_lvbptr)
+ lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
+ sizeof(struct dlm_lksb);
+ return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
+}
+
+static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
+{
+}
+
+static int user_plock(struct ocfs2_cluster_connection *conn,
+ u64 ino,
+ struct file *file,
+ int cmd,
+ struct file_lock *fl)
+{
+ /*
+ * This more or less just demuxes the plock request into any
+ * one of three dlm calls.
+ *
+ * Internally, fs/dlm will pass these to a misc device, which
+ * a userspace daemon will read and write to.
+ *
+ * For now, cancel requests (which happen internally only),
+ * are turned into unlocks. Most of this function taken from
+ * gfs2_lock.
+ */
+
+ if (cmd == F_CANCELLK) {
+ cmd = F_SETLK;
+ fl->fl_type = F_UNLCK;
+ }
+
+ if (IS_GETLK(cmd))
+ return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
+ else if (fl->fl_type == F_UNLCK)
+ return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
+ else
+ return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
+}
+
+/*
+ * Compare a requested locking protocol version against the current one.
+ *
+ * If the major numbers are different, they are incompatible.
+ * If the current minor is greater than the request, they are incompatible.
+ * If the current minor is less than or equal to the request, they are
+ * compatible, and the requester should run at the current minor version.
+ */
+static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
+ struct ocfs2_protocol_version *request)
+{
+ if (existing->pv_major != request->pv_major)
+ return 1;
+
+ if (existing->pv_minor > request->pv_minor)
+ return 1;
+
+ if (existing->pv_minor < request->pv_minor)
+ request->pv_minor = existing->pv_minor;
+
+ return 0;
+}
+
+static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ ver->pv_major = pv->pv_major;
+ ver->pv_minor = pv->pv_minor;
+}
+
+static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ pv->pv_major = ver->pv_major;
+ pv->pv_minor = ver->pv_minor;
+}
+
+static void sync_wait_cb(void *arg)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ complete(&lc->oc_sync_wait);
+}
+
+static int sync_unlock(struct ocfs2_cluster_connection *conn,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x error %d\n",
+ name, lksb->sb_lkid, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ if (lksb->sb_status != -DLM_EUNLOCK) {
+ printk(KERN_ERR "%s lkid %x status %d\n",
+ name, lksb->sb_lkid, lksb->sb_status);
+ return -1;
+ }
+ return 0;
+}
+
+static int sync_lock(struct ocfs2_cluster_connection *conn,
+ int mode, uint32_t flags,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error, status;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_lock(conn->cc_lockspace, mode, lksb, flags,
+ name, strlen(name),
+ 0, sync_wait_cb, conn, NULL);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n",
+ name, lksb->sb_lkid, flags, mode, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ status = lksb->sb_status;
+
+ if (status && status != -EAGAIN) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n",
+ name, lksb->sb_lkid, flags, mode, status);
+ }
+
+ return status;
+}
+
+
+static int version_lock(struct ocfs2_cluster_connection *conn, int mode,
+ int flags)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_lock(conn, mode, flags,
+ &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+static int version_unlock(struct ocfs2_cluster_connection *conn)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+/* get_protocol_version()
+ *
+ * To exchange ocfs2 versioning, we use the LVB of the version dlm lock.
+ * The algorithm is:
+ * 1. Attempt to take the lock in EX mode (non-blocking).
+ * 2. If successful (which means it is the first mount), write the
+ * version number and downconvert to PR lock.
+ * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after
+ * taking the PR lock.
+ */
+
+static int get_protocol_version(struct ocfs2_cluster_connection *conn)
+{
+ int ret;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ struct ocfs2_protocol_version pv;
+
+ running_proto.pv_major =
+ ocfs2_user_plugin.sp_max_proto.pv_major;
+ running_proto.pv_minor =
+ ocfs2_user_plugin.sp_max_proto.pv_minor;
+
+ lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb;
+ ret = version_lock(conn, DLM_LOCK_EX,
+ DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
+ if (!ret) {
+ conn->cc_version.pv_major = running_proto.pv_major;
+ conn->cc_version.pv_minor = running_proto.pv_minor;
+ version_to_lvb(&running_proto, lc->oc_lvb);
+ version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+ } else if (ret == -EAGAIN) {
+ ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
+ if (ret)
+ goto out;
+ lvb_to_version(lc->oc_lvb, &pv);
+
+ if ((pv.pv_major != running_proto.pv_major) ||
+ (pv.pv_minor > running_proto.pv_minor)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ conn->cc_version.pv_major = pv.pv_major;
+ conn->cc_version.pv_minor = pv.pv_minor;
+ }
+out:
+ return ret;
+}
+
+static void user_recover_prep(void *arg)
+{
+}
+
+static void user_recover_slot(void *arg, struct dlm_slot *slot)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
+ slot->nodeid, slot->slot);
+ conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
+
+}
+
+static void user_recover_done(void *arg, struct dlm_slot *slots,
+ int num_slots, int our_slot,
+ uint32_t generation)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ int i;
+
+ for (i = 0; i < num_slots; i++)
+ if (slots[i].slot == our_slot) {
+ atomic_set(&lc->oc_this_node, slots[i].nodeid);
+ break;
+ }
+
+ lc->oc_our_slot = our_slot;
+ wake_up(&lc->oc_wait);
+}
+
+static const struct dlm_lockspace_ops ocfs2_ls_ops = {
+ .recover_prep = user_recover_prep,
+ .recover_slot = user_recover_slot,
+ .recover_done = user_recover_done,
+};
+
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+ version_unlock(conn);
+ dlm_release_lockspace(conn->cc_lockspace, 2);
+ conn->cc_lockspace = NULL;
+ ocfs2_live_connection_drop(conn->cc_private);
+ conn->cc_private = NULL;
+ return 0;
+}
+
+static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
+{
+ dlm_lockspace_t *fsdlm;
+ struct ocfs2_live_connection *lc;
+ int rc, ops_rv;
+
+ BUG_ON(conn == NULL);
+
+ lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+ if (!lc) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ init_waitqueue_head(&lc->oc_wait);
+ init_completion(&lc->oc_sync_wait);
+ atomic_set(&lc->oc_this_node, 0);
+ conn->cc_private = lc;
+ lc->oc_type = NO_CONTROLD;
+
+ rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
+ DLM_LSFL_FS, DLM_LVB_LEN,
+ &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
+ if (rc)
+ goto out;
+
+ if (ops_rv == -EOPNOTSUPP) {
+ lc->oc_type = WITH_CONTROLD;
+ printk(KERN_NOTICE "ocfs2: You seem to be using an older "
+ "version of dlm_controld and/or ocfs2-tools."
+ " Please consider upgrading.\n");
+ } else if (ops_rv) {
+ rc = ops_rv;
+ goto out;
+ }
+ conn->cc_lockspace = fsdlm;
+
+ rc = ocfs2_live_connection_attach(conn, lc);
+ if (rc)
+ goto out;
+
+ if (lc->oc_type == NO_CONTROLD) {
+ rc = get_protocol_version(conn);
+ if (rc) {
+ printk(KERN_ERR "ocfs2: Could not determine"
+ " locking version\n");
+ user_cluster_disconnect(conn);
+ goto out;
+ }
+ wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
+ }
+
+ /*
+ * running_proto must have been set before we allowed any mounts
+ * to proceed.
+ */
+ if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
+ printk(KERN_ERR
+ "Unable to mount with fs locking protocol version "
+ "%u.%u because negotiated protocol is %u.%u\n",
+ conn->cc_version.pv_major, conn->cc_version.pv_minor,
+ running_proto.pv_major, running_proto.pv_minor);
+ rc = -EPROTO;
+ ocfs2_live_connection_drop(lc);
+ lc = NULL;
+ }
+
+out:
+ if (rc && lc)
+ kfree(lc);
+ return rc;
+}
+
+
+static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *this_node)
+{
+ int rc;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ if (lc->oc_type == WITH_CONTROLD)
+ rc = ocfs2_control_get_this_node();
+ else if (lc->oc_type == NO_CONTROLD)
+ rc = atomic_read(&lc->oc_this_node);
+ else
+ rc = -EINVAL;
+
+ if (rc < 0)
+ return rc;
+
+ *this_node = rc;
+ return 0;
+}
+
+static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
+ .connect = user_cluster_connect,
+ .disconnect = user_cluster_disconnect,
+ .this_node = user_cluster_this_node,
+ .dlm_lock = user_dlm_lock,
+ .dlm_unlock = user_dlm_unlock,
+ .lock_status = user_dlm_lock_status,
+ .lvb_valid = user_dlm_lvb_valid,
+ .lock_lvb = user_dlm_lvb,
+ .plock = user_plock,
+ .dump_lksb = user_dlm_dump_lksb,
+};
+
+static struct ocfs2_stack_plugin ocfs2_user_plugin = {
+ .sp_name = "user",
+ .sp_ops = &ocfs2_user_plugin_ops,
+ .sp_owner = THIS_MODULE,
+};
+
+
+static int __init ocfs2_user_plugin_init(void)
+{
+ int rc;
+
+ rc = ocfs2_control_init();
+ if (!rc) {
+ rc = ocfs2_stack_glue_register(&ocfs2_user_plugin);
+ if (rc)
+ ocfs2_control_exit();
+ }
+
+ return rc;
+}
+
+static void __exit ocfs2_user_plugin_exit(void)
+{
+ ocfs2_stack_glue_unregister(&ocfs2_user_plugin);
+ ocfs2_control_exit();
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks");
+MODULE_LICENSE("GPL");
+module_init(ocfs2_user_plugin_init);
+module_exit(ocfs2_user_plugin_exit);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
new file mode 100644
index 00000000000..5d965e83bd4
--- /dev/null
+++ b/fs/ocfs2/stackglue.c
@@ -0,0 +1,748 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stackglue.c
+ *
+ * Code which implements an OCFS2 specific interface to underlying
+ * cluster stacks.
+ *
+ * Copyright (C) 2007, 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/sysctl.h>
+
+#include "ocfs2_fs.h"
+
+#include "stackglue.h"
+
+#define OCFS2_STACK_PLUGIN_O2CB "o2cb"
+#define OCFS2_STACK_PLUGIN_USER "user"
+#define OCFS2_MAX_HB_CTL_PATH 256
+
+static struct ocfs2_protocol_version locking_max_version;
+static DEFINE_SPINLOCK(ocfs2_stack_lock);
+static LIST_HEAD(ocfs2_stack_list);
+static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
+static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
+
+/*
+ * The stack currently in use. If not null, active_stack->sp_count > 0,
+ * the module is pinned, and the locking protocol cannot be changed.
+ */
+static struct ocfs2_stack_plugin *active_stack;
+
+static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
+{
+ struct ocfs2_stack_plugin *p;
+
+ assert_spin_locked(&ocfs2_stack_lock);
+
+ list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+ if (!strcmp(p->sp_name, name))
+ return p;
+ }
+
+ return NULL;
+}
+
+static int ocfs2_stack_driver_request(const char *stack_name,
+ const char *plugin_name)
+{
+ int rc;
+ struct ocfs2_stack_plugin *p;
+
+ spin_lock(&ocfs2_stack_lock);
+
+ /*
+ * If the stack passed by the filesystem isn't the selected one,
+ * we can't continue.
+ */
+ if (strcmp(stack_name, cluster_stack_name)) {
+ rc = -EBUSY;
+ goto out;
+ }
+
+ if (active_stack) {
+ /*
+ * If the active stack isn't the one we want, it cannot
+ * be selected right now.
+ */
+ if (!strcmp(active_stack->sp_name, plugin_name))
+ rc = 0;
+ else
+ rc = -EBUSY;
+ goto out;
+ }
+
+ p = ocfs2_stack_lookup(plugin_name);
+ if (!p || !try_module_get(p->sp_owner)) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ active_stack = p;
+ rc = 0;
+
+out:
+ /* If we found it, pin it */
+ if (!rc)
+ active_stack->sp_count++;
+
+ spin_unlock(&ocfs2_stack_lock);
+ return rc;
+}
+
+/*
+ * This function looks up the appropriate stack and makes it active. If
+ * there is no stack, it tries to load it. It will fail if the stack still
+ * cannot be found. It will also fail if a different stack is in use.
+ */
+static int ocfs2_stack_driver_get(const char *stack_name)
+{
+ int rc;
+ char *plugin_name = OCFS2_STACK_PLUGIN_O2CB;
+
+ /*
+ * Classic stack does not pass in a stack name. This is
+ * compatible with older tools as well.
+ */
+ if (!stack_name || !*stack_name)
+ stack_name = OCFS2_STACK_PLUGIN_O2CB;
+
+ if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) {
+ printk(KERN_ERR
+ "ocfs2 passed an invalid cluster stack label: \"%s\"\n",
+ stack_name);
+ return -EINVAL;
+ }
+
+ /* Anything that isn't the classic stack is a user stack */
+ if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB))
+ plugin_name = OCFS2_STACK_PLUGIN_USER;
+
+ rc = ocfs2_stack_driver_request(stack_name, plugin_name);
+ if (rc == -ENOENT) {
+ request_module("ocfs2_stack_%s", plugin_name);
+ rc = ocfs2_stack_driver_request(stack_name, plugin_name);
+ }
+
+ if (rc == -ENOENT) {
+ printk(KERN_ERR
+ "ocfs2: Cluster stack driver \"%s\" cannot be found\n",
+ plugin_name);
+ } else if (rc == -EBUSY) {
+ printk(KERN_ERR
+ "ocfs2: A different cluster stack is in use\n");
+ }
+
+ return rc;
+}
+
+static void ocfs2_stack_driver_put(void)
+{
+ spin_lock(&ocfs2_stack_lock);
+ BUG_ON(active_stack == NULL);
+ BUG_ON(active_stack->sp_count == 0);
+
+ active_stack->sp_count--;
+ if (!active_stack->sp_count) {
+ module_put(active_stack->sp_owner);
+ active_stack = NULL;
+ }
+ spin_unlock(&ocfs2_stack_lock);
+}
+
+int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
+{
+ int rc;
+
+ spin_lock(&ocfs2_stack_lock);
+ if (!ocfs2_stack_lookup(plugin->sp_name)) {
+ plugin->sp_count = 0;
+ plugin->sp_max_proto = locking_max_version;
+ list_add(&plugin->sp_list, &ocfs2_stack_list);
+ printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
+ plugin->sp_name);
+ rc = 0;
+ } else {
+ printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n",
+ plugin->sp_name);
+ rc = -EEXIST;
+ }
+ spin_unlock(&ocfs2_stack_lock);
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register);
+
+void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
+{
+ struct ocfs2_stack_plugin *p;
+
+ spin_lock(&ocfs2_stack_lock);
+ p = ocfs2_stack_lookup(plugin->sp_name);
+ if (p) {
+ BUG_ON(p != plugin);
+ BUG_ON(plugin == active_stack);
+ BUG_ON(plugin->sp_count != 0);
+ list_del_init(&plugin->sp_list);
+ printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n",
+ plugin->sp_name);
+ } else {
+ printk(KERN_ERR "Stack \"%s\" is not registered\n",
+ plugin->sp_name);
+ }
+ spin_unlock(&ocfs2_stack_lock);
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
+
+void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto)
+{
+ struct ocfs2_stack_plugin *p;
+
+ spin_lock(&ocfs2_stack_lock);
+ if (memcmp(max_proto, &locking_max_version,
+ sizeof(struct ocfs2_protocol_version))) {
+ BUG_ON(locking_max_version.pv_major != 0);
+
+ locking_max_version = *max_proto;
+ list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+ p->sp_max_proto = locking_max_version;
+ }
+ }
+ spin_unlock(&ocfs2_stack_lock);
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version);
+
+
+/*
+ * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument
+ * for the ast and bast functions. They will pass the lksb to the ast
+ * and bast. The caller can wrap the lksb with their own structure to
+ * get more information.
+ */
+int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
+ int mode,
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ void *name,
+ unsigned int namelen)
+{
+ if (!lksb->lksb_conn)
+ lksb->lksb_conn = conn;
+ else
+ BUG_ON(lksb->lksb_conn != conn);
+ return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
+ name, namelen);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
+
+int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags)
+{
+ BUG_ON(lksb->lksb_conn == NULL);
+
+ return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
+
+int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
+{
+ return active_stack->sp_ops->lock_status(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
+
+int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
+{
+ return active_stack->sp_ops->lvb_valid(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
+
+void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
+{
+ return active_stack->sp_ops->lock_lvb(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
+
+void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
+{
+ active_stack->sp_ops->dump_lksb(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
+
+int ocfs2_stack_supports_plocks(void)
+{
+ return active_stack && active_stack->sp_ops->plock;
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks);
+
+/*
+ * ocfs2_plock() can only be safely called if
+ * ocfs2_stack_supports_plocks() returned true
+ */
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+ struct file *file, int cmd, struct file_lock *fl)
+{
+ WARN_ON_ONCE(active_stack->sp_ops->plock == NULL);
+ if (active_stack->sp_ops->plock)
+ return active_stack->sp_ops->plock(conn, ino, file, cmd, fl);
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(ocfs2_plock);
+
+int ocfs2_cluster_connect(const char *stack_name,
+ const char *cluster_name,
+ int cluster_name_len,
+ const char *group,
+ int grouplen,
+ struct ocfs2_locking_protocol *lproto,
+ void (*recovery_handler)(int node_num,
+ void *recovery_data),
+ void *recovery_data,
+ struct ocfs2_cluster_connection **conn)
+{
+ int rc = 0;
+ struct ocfs2_cluster_connection *new_conn;
+
+ BUG_ON(group == NULL);
+ BUG_ON(conn == NULL);
+ BUG_ON(recovery_handler == NULL);
+
+ if (grouplen > GROUP_NAME_MAX) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (memcmp(&lproto->lp_max_version, &locking_max_version,
+ sizeof(struct ocfs2_protocol_version))) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
+ GFP_KERNEL);
+ if (!new_conn) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
+ new_conn->cc_namelen = grouplen;
+ if (cluster_name_len)
+ strlcpy(new_conn->cc_cluster_name, cluster_name,
+ CLUSTER_NAME_MAX + 1);
+ new_conn->cc_cluster_name_len = cluster_name_len;
+ new_conn->cc_recovery_handler = recovery_handler;
+ new_conn->cc_recovery_data = recovery_data;
+
+ new_conn->cc_proto = lproto;
+ /* Start the new connection at our maximum compatibility level */
+ new_conn->cc_version = lproto->lp_max_version;
+
+ /* This will pin the stack driver if successful */
+ rc = ocfs2_stack_driver_get(stack_name);
+ if (rc)
+ goto out_free;
+
+ rc = active_stack->sp_ops->connect(new_conn);
+ if (rc) {
+ ocfs2_stack_driver_put();
+ goto out_free;
+ }
+
+ *conn = new_conn;
+
+out_free:
+ if (rc)
+ kfree(new_conn);
+
+out:
+ return rc;
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
+
+/* The caller will ensure all nodes have the same cluster stack */
+int ocfs2_cluster_connect_agnostic(const char *group,
+ int grouplen,
+ struct ocfs2_locking_protocol *lproto,
+ void (*recovery_handler)(int node_num,
+ void *recovery_data),
+ void *recovery_data,
+ struct ocfs2_cluster_connection **conn)
+{
+ char *stack_name = NULL;
+
+ if (cluster_stack_name[0])
+ stack_name = cluster_stack_name;
+ return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
+ lproto, recovery_handler, recovery_data,
+ conn);
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
+
+/* If hangup_pending is 0, the stack driver will be dropped */
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+ int hangup_pending)
+{
+ int ret;
+
+ BUG_ON(conn == NULL);
+
+ ret = active_stack->sp_ops->disconnect(conn);
+
+ /* XXX Should we free it anyway? */
+ if (!ret) {
+ kfree(conn);
+ if (!hangup_pending)
+ ocfs2_stack_driver_put();
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect);
+
+/*
+ * Leave the group for this filesystem. This is executed by a userspace
+ * program (stored in ocfs2_hb_ctl_path).
+ */
+static void ocfs2_leave_group(const char *group)
+{
+ int ret;
+ char *argv[5], *envp[3];
+
+ argv[0] = ocfs2_hb_ctl_path;
+ argv[1] = "-K";
+ argv[2] = "-u";
+ argv[3] = (char *)group;
+ argv[4] = NULL;
+
+ /* minimal command environment taken from cpu_run_sbin_hotplug */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ if (ret < 0) {
+ printk(KERN_ERR
+ "ocfs2: Error %d running user helper "
+ "\"%s %s %s %s\"\n",
+ ret, argv[0], argv[1], argv[2], argv[3]);
+ }
+}
+
+/*
+ * Hangup is a required post-umount. ocfs2-tools software expects the
+ * filesystem to call "ocfs2_hb_ctl" during unmount. This happens
+ * regardless of whether the DLM got started, so we can't do it
+ * in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does
+ * the actual work.
+ */
+void ocfs2_cluster_hangup(const char *group, int grouplen)
+{
+ BUG_ON(group == NULL);
+ BUG_ON(group[grouplen] != '\0');
+
+ ocfs2_leave_group(group);
+
+ /* cluster_disconnect() was called with hangup_pending==1 */
+ ocfs2_stack_driver_put();
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
+
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node)
+{
+ return active_stack->sp_ops->this_node(conn, node);
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
+
+
+/*
+ * Sysfs bits
+ */
+
+static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ ssize_t ret = 0;
+
+ spin_lock(&ocfs2_stack_lock);
+ if (locking_max_version.pv_major)
+ ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
+ locking_max_version.pv_major,
+ locking_max_version.pv_minor);
+ spin_unlock(&ocfs2_stack_lock);
+
+ return ret;
+}
+
+static struct kobj_attribute ocfs2_attr_max_locking_protocol =
+ __ATTR(max_locking_protocol, S_IRUGO,
+ ocfs2_max_locking_protocol_show, NULL);
+
+static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
+ struct ocfs2_stack_plugin *p;
+
+ spin_lock(&ocfs2_stack_lock);
+ list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+ ret = snprintf(buf, remain, "%s\n",
+ p->sp_name);
+ if (ret < 0) {
+ total = ret;
+ break;
+ }
+ if (ret == remain) {
+ /* snprintf() didn't fit */
+ total = -E2BIG;
+ break;
+ }
+ total += ret;
+ remain -= ret;
+ }
+ spin_unlock(&ocfs2_stack_lock);
+
+ return total;
+}
+
+static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
+ __ATTR(loaded_cluster_plugins, S_IRUGO,
+ ocfs2_loaded_cluster_plugins_show, NULL);
+
+static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ ssize_t ret = 0;
+
+ spin_lock(&ocfs2_stack_lock);
+ if (active_stack) {
+ ret = snprintf(buf, PAGE_SIZE, "%s\n",
+ active_stack->sp_name);
+ if (ret == PAGE_SIZE)
+ ret = -E2BIG;
+ }
+ spin_unlock(&ocfs2_stack_lock);
+
+ return ret;
+}
+
+static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
+ __ATTR(active_cluster_plugin, S_IRUGO,
+ ocfs2_active_cluster_plugin_show, NULL);
+
+static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ ssize_t ret;
+ spin_lock(&ocfs2_stack_lock);
+ ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name);
+ spin_unlock(&ocfs2_stack_lock);
+
+ return ret;
+}
+
+static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ size_t len = count;
+ ssize_t ret;
+
+ if (len == 0)
+ return len;
+
+ if (buf[len - 1] == '\n')
+ len--;
+
+ if ((len != OCFS2_STACK_LABEL_LEN) ||
+ (strnlen(buf, len) != len))
+ return -EINVAL;
+
+ spin_lock(&ocfs2_stack_lock);
+ if (active_stack) {
+ if (!strncmp(buf, cluster_stack_name, len))
+ ret = count;
+ else
+ ret = -EBUSY;
+ } else {
+ memcpy(cluster_stack_name, buf, len);
+ ret = count;
+ }
+ spin_unlock(&ocfs2_stack_lock);
+
+ return ret;
+}
+
+
+static struct kobj_attribute ocfs2_attr_cluster_stack =
+ __ATTR(cluster_stack, S_IRUGO | S_IWUSR,
+ ocfs2_cluster_stack_show,
+ ocfs2_cluster_stack_store);
+
+
+
+static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "1\n");
+}
+
+static struct kobj_attribute ocfs2_attr_dlm_recover_support =
+ __ATTR(dlm_recover_callback_support, S_IRUGO,
+ ocfs2_dlm_recover_show, NULL);
+
+static struct attribute *ocfs2_attrs[] = {
+ &ocfs2_attr_max_locking_protocol.attr,
+ &ocfs2_attr_loaded_cluster_plugins.attr,
+ &ocfs2_attr_active_cluster_plugin.attr,
+ &ocfs2_attr_cluster_stack.attr,
+ &ocfs2_attr_dlm_recover_support.attr,
+ NULL,
+};
+
+static struct attribute_group ocfs2_attr_group = {
+ .attrs = ocfs2_attrs,
+};
+
+static struct kset *ocfs2_kset;
+
+static void ocfs2_sysfs_exit(void)
+{
+ kset_unregister(ocfs2_kset);
+}
+
+static int ocfs2_sysfs_init(void)
+{
+ int ret;
+
+ ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj);
+ if (!ocfs2_kset)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group);
+ if (ret)
+ goto error;
+
+ return 0;
+
+error:
+ kset_unregister(ocfs2_kset);
+ return ret;
+}
+
+/*
+ * Sysctl bits
+ *
+ * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't
+ * make as much sense in a multiple cluster stack world, but it's safer
+ * and easier to preserve the name.
+ */
+
+#define FS_OCFS2_NM 1
+
+static struct ctl_table ocfs2_nm_table[] = {
+ {
+ .procname = "hb_ctl_path",
+ .data = ocfs2_hb_ctl_path,
+ .maxlen = OCFS2_MAX_HB_CTL_PATH,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ { }
+};
+
+static struct ctl_table ocfs2_mod_table[] = {
+ {
+ .procname = "nm",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = ocfs2_nm_table
+ },
+ { }
+};
+
+static struct ctl_table ocfs2_kern_table[] = {
+ {
+ .procname = "ocfs2",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = ocfs2_mod_table
+ },
+ { }
+};
+
+static struct ctl_table ocfs2_root_table[] = {
+ {
+ .procname = "fs",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = ocfs2_kern_table
+ },
+ { }
+};
+
+static struct ctl_table_header *ocfs2_table_header;
+
+
+/*
+ * Initialization
+ */
+
+static int __init ocfs2_stack_glue_init(void)
+{
+ strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
+
+ ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
+ if (!ocfs2_table_header) {
+ printk(KERN_ERR
+ "ocfs2 stack glue: unable to register sysctl\n");
+ return -ENOMEM; /* or something. */
+ }
+
+ return ocfs2_sysfs_init();
+}
+
+static void __exit ocfs2_stack_glue_exit(void)
+{
+ memset(&locking_max_version, 0,
+ sizeof(struct ocfs2_protocol_version));
+ locking_max_version.pv_major = 0;
+ locking_max_version.pv_minor = 0;
+ ocfs2_sysfs_exit();
+ if (ocfs2_table_header)
+ unregister_sysctl_table(ocfs2_table_header);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 cluter stack glue layer");
+MODULE_LICENSE("GPL");
+module_init(ocfs2_stack_glue_init);
+module_exit(ocfs2_stack_glue_exit);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
new file mode 100644
index 00000000000..66334a30cea
--- /dev/null
+++ b/fs/ocfs2/stackglue.h
@@ -0,0 +1,301 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stackglue.h
+ *
+ * Glue to the underlying cluster stack.
+ *
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+
+#ifndef STACKGLUE_H
+#define STACKGLUE_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/dlmconstants.h>
+
+#include "dlm/dlmapi.h"
+#include <linux/dlm.h>
+
+/* Needed for plock-related prototypes */
+struct file;
+struct file_lock;
+
+/*
+ * dlmconstants.h does not have a LOCAL flag. We hope to remove it
+ * some day, but right now we need it. Let's fake it. This value is larger
+ * than any flag in dlmconstants.h.
+ */
+#define DLM_LKF_LOCAL 0x00100000
+
+/*
+ * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h. That probably
+ * wants to be in a public header.
+ */
+#define GROUP_NAME_MAX 64
+
+/* This shadows OCFS2_CLUSTER_NAME_LEN */
+#define CLUSTER_NAME_MAX 16
+
+
+/*
+ * ocfs2_protocol_version changes when ocfs2 does something different in
+ * its inter-node behavior. See dlmglue.c for more information.
+ */
+struct ocfs2_protocol_version {
+ u8 pv_major;
+ u8 pv_minor;
+};
+
+/*
+ * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
+ * has a pointer to separately allocated lvb space. This struct exists only to
+ * include in the lksb union to make space for a combined dlm_lksb and lvb.
+ */
+struct fsdlm_lksb_plus_lvb {
+ struct dlm_lksb lksb;
+ char lvb[DLM_LVB_LEN];
+};
+
+/*
+ * A union of all lock status structures. We define it here so that the
+ * size of the union is known. Lock status structures are embedded in
+ * ocfs2 inodes.
+ */
+struct ocfs2_cluster_connection;
+struct ocfs2_dlm_lksb {
+ union {
+ struct dlm_lockstatus lksb_o2dlm;
+ struct dlm_lksb lksb_fsdlm;
+ struct fsdlm_lksb_plus_lvb padding;
+ };
+ struct ocfs2_cluster_connection *lksb_conn;
+};
+
+/*
+ * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
+ */
+struct ocfs2_locking_protocol {
+ struct ocfs2_protocol_version lp_max_version;
+ void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
+ void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
+ void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
+};
+
+
+/*
+ * A cluster connection. Mostly opaque to ocfs2, the connection holds
+ * state for the underlying stack. ocfs2 does use cc_version to determine
+ * locking compatibility.
+ */
+struct ocfs2_cluster_connection {
+ char cc_name[GROUP_NAME_MAX + 1];
+ int cc_namelen;
+ char cc_cluster_name[CLUSTER_NAME_MAX + 1];
+ int cc_cluster_name_len;
+ struct ocfs2_protocol_version cc_version;
+ struct ocfs2_locking_protocol *cc_proto;
+ void (*cc_recovery_handler)(int node_num, void *recovery_data);
+ void *cc_recovery_data;
+ void *cc_lockspace;
+ void *cc_private;
+};
+
+/*
+ * Each cluster stack implements the stack operations structure. Not used
+ * in the ocfs2 code, the stackglue code translates generic cluster calls
+ * into stack operations.
+ */
+struct ocfs2_stack_operations {
+ /*
+ * The fs code calls ocfs2_cluster_connect() to attach a new
+ * filesystem to the cluster stack. The ->connect() op is passed
+ * an ocfs2_cluster_connection with the name and recovery field
+ * filled in.
+ *
+ * The stack must set up any notification mechanisms and create
+ * the filesystem lockspace in the DLM. The lockspace should be
+ * stored on cc_lockspace. Any other information can be stored on
+ * cc_private.
+ *
+ * ->connect() must not return until it is guaranteed that
+ *
+ * - Node down notifications for the filesystem will be received
+ * and passed to conn->cc_recovery_handler().
+ * - Locking requests for the filesystem will be processed.
+ */
+ int (*connect)(struct ocfs2_cluster_connection *conn);
+
+ /*
+ * The fs code calls ocfs2_cluster_disconnect() when a filesystem
+ * no longer needs cluster services. All DLM locks have been
+ * dropped, and recovery notification is being ignored by the
+ * fs code. The stack must disengage from the DLM and discontinue
+ * recovery notification.
+ *
+ * Once ->disconnect() has returned, the connection structure will
+ * be freed. Thus, a stack must not return from ->disconnect()
+ * until it will no longer reference the conn pointer.
+ *
+ * Once this call returns, the stack glue will be dropping this
+ * connection's reference on the module.
+ */
+ int (*disconnect)(struct ocfs2_cluster_connection *conn);
+
+ /*
+ * ->this_node() returns the cluster's unique identifier for the
+ * local node.
+ */
+ int (*this_node)(struct ocfs2_cluster_connection *conn,
+ unsigned int *node);
+
+ /*
+ * Call the underlying dlm lock function. The ->dlm_lock()
+ * callback should convert the flags and mode as appropriate.
+ *
+ * ast and bast functions are not part of the call because the
+ * stack will likely want to wrap ast and bast calls before passing
+ * them to stack->sp_proto. There is no astarg. The lksb will
+ * be passed back to the ast and bast functions. The caller can
+ * use this to find their object.
+ */
+ int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
+ int mode,
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ void *name,
+ unsigned int namelen);
+
+ /*
+ * Call the underlying dlm unlock function. The ->dlm_unlock()
+ * function should convert the flags as appropriate.
+ *
+ * The unlock ast is not passed, as the stack will want to wrap
+ * it before calling stack->sp_proto->lp_unlock_ast(). There is
+ * no astarg. The lksb will be passed back to the unlock ast
+ * function. The caller can use this to find their object.
+ */
+ int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags);
+
+ /*
+ * Return the status of the current lock status block. The fs
+ * code should never dereference the union. The ->lock_status()
+ * callback pulls out the stack-specific lksb, converts the status
+ * to a proper errno, and returns it.
+ */
+ int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
+
+ /*
+ * Return non-zero if the LVB is valid.
+ */
+ int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
+
+ /*
+ * Pull the lvb pointer off of the stack-specific lksb.
+ */
+ void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
+
+ /*
+ * Cluster-aware posix locks
+ *
+ * This is NULL for stacks which do not support posix locks.
+ */
+ int (*plock)(struct ocfs2_cluster_connection *conn,
+ u64 ino,
+ struct file *file,
+ int cmd,
+ struct file_lock *fl);
+
+ /*
+ * This is an optoinal debugging hook. If provided, the
+ * stack can dump debugging information about this lock.
+ */
+ void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
+};
+
+/*
+ * Each stack plugin must describe itself by registering a
+ * ocfs2_stack_plugin structure. This is only seen by stackglue and the
+ * stack driver.
+ */
+struct ocfs2_stack_plugin {
+ char *sp_name;
+ struct ocfs2_stack_operations *sp_ops;
+ struct module *sp_owner;
+
+ /* These are managed by the stackglue code. */
+ struct list_head sp_list;
+ unsigned int sp_count;
+ struct ocfs2_protocol_version sp_max_proto;
+};
+
+
+/* Used by the filesystem */
+int ocfs2_cluster_connect(const char *stack_name,
+ const char *cluster_name,
+ int cluster_name_len,
+ const char *group,
+ int grouplen,
+ struct ocfs2_locking_protocol *lproto,
+ void (*recovery_handler)(int node_num,
+ void *recovery_data),
+ void *recovery_data,
+ struct ocfs2_cluster_connection **conn);
+/*
+ * Used by callers that don't store their stack name. They must ensure
+ * all nodes have the same stack.
+ */
+int ocfs2_cluster_connect_agnostic(const char *group,
+ int grouplen,
+ struct ocfs2_locking_protocol *lproto,
+ void (*recovery_handler)(int node_num,
+ void *recovery_data),
+ void *recovery_data,
+ struct ocfs2_cluster_connection **conn);
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+ int hangup_pending);
+void ocfs2_cluster_hangup(const char *group, int grouplen);
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node);
+
+struct ocfs2_lock_res;
+int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
+ int mode,
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags,
+ void *name,
+ unsigned int namelen);
+int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags);
+
+int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
+int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
+void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
+void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
+
+int ocfs2_stack_supports_plocks(void);
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+ struct file *file, int cmd, struct file_lock *fl);
+
+void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
+
+
+/* Used by stack plugins */
+int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
+void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+
+#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 9d91e66f51a..0cb889a17ae 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -29,12 +29,12 @@
#include <linux/slab.h>
#include <linux/highmem.h>
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
@@ -43,56 +43,77 @@
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
+#define NOT_ALLOC_NEW_GROUP 0
+#define ALLOC_NEW_GROUP 0x1
+#define ALLOC_GROUPS_FROM_GLOBAL 0x2
+
+#define OCFS2_MAX_TO_STEAL 1024
+
+struct ocfs2_suballoc_result {
+ u64 sr_bg_blkno; /* The bg we allocated from. Set
+ to 0 when a block group is
+ contiguous. */
+ u64 sr_bg_stable_blkno; /*
+ * Doesn't change, always
+ * set to target block
+ * group descriptor
+ * block.
+ */
+ u64 sr_blkno; /* The first allocated block */
+ unsigned int sr_bit_offset; /* The bit in the bg */
+ unsigned int sr_bits; /* How many bits we claimed */
+};
+
+static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
+{
+ if (res->sr_blkno == 0)
+ return 0;
+
+ if (res->sr_bg_blkno)
+ return res->sr_bg_blkno;
+
+ return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
+}
+
static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
-static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
+static int ocfs2_block_group_fill(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *bg_bh,
u64 group_blkno,
+ unsigned int group_clusters,
u16 my_chain,
struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
struct inode *alloc_inode,
- struct buffer_head *bh);
-
-static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
- struct ocfs2_alloc_context *ac);
+ struct buffer_head *bh,
+ u64 max_block,
+ u64 *last_alloc_group,
+ int flags);
static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
- u16 *bit_off, u16 *bits_found);
+ u64 max_block,
+ struct ocfs2_suballoc_result *res);
static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
- u16 *bit_off, u16 *bits_found);
-static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
- struct ocfs2_alloc_context *ac,
+ u64 max_block,
+ struct ocfs2_suballoc_result *res);
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
+ handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 *bg_blkno);
+ struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
int nr);
-static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits);
-static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits);
-
-static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
+static int ocfs2_relink_block_group(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *fe_bh,
struct buffer_head *bg_bh,
@@ -100,30 +121,43 @@ static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
u16 chain);
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
u32 wanted);
-static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
- struct inode *alloc_inode,
- struct buffer_head *alloc_bh,
- unsigned int start_bit,
- u64 bg_blkno,
- unsigned int count);
-static inline u64 ocfs2_which_suballoc_group(u64 block,
- unsigned int bit);
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
u64 bg_blkno,
u16 bg_bit_off);
-static inline u64 ocfs2_which_cluster_group(struct inode *inode,
- u32 cluster);
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
u64 data_blkno,
u64 *bg_blkno,
u16 *bg_bit_off);
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+ u32 bits_wanted, u64 max_block,
+ int flags,
+ struct ocfs2_alloc_context **ac);
+
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
+{
+ struct inode *inode = ac->ac_inode;
+
+ if (inode) {
+ if (ac->ac_which != OCFS2_AC_USE_LOCAL)
+ ocfs2_inode_unlock(inode, 1);
+
+ mutex_unlock(&inode->i_mutex);
+
+ iput(inode);
+ ac->ac_inode = NULL;
+ }
+ brelse(ac->ac_bh);
+ ac->ac_bh = NULL;
+ ac->ac_resv = NULL;
+ if (ac->ac_find_loc_priv) {
+ kfree(ac->ac_find_loc_priv);
+ ac->ac_find_loc_priv = NULL;
+ }
+}
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
- if (ac->ac_inode)
- iput(ac->ac_inode);
- if (ac->ac_bh)
- brelse(ac->ac_bh);
+ ocfs2_free_ac_resource(ac);
kfree(ac);
}
@@ -132,77 +166,223 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
}
-/* somewhat more expensive than our other checks, so use sparingly. */
-static int ocfs2_check_group_descriptor(struct super_block *sb,
- struct ocfs2_dinode *di,
- struct ocfs2_group_desc *gd)
+#define do_error(fmt, ...) \
+ do{ \
+ if (resize) \
+ mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
+ else \
+ ocfs2_error(sb, fmt, ##__VA_ARGS__); \
+ } while (0)
+
+static int ocfs2_validate_gd_self(struct super_block *sb,
+ struct buffer_head *bh,
+ int resize)
{
- unsigned int max_bits;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
- return -EIO;
+ do_error("Group descriptor #%llu has bad signature %.*s",
+ (unsigned long long)bh->b_blocknr, 7,
+ gd->bg_signature);
+ return -EINVAL;
+ }
+
+ if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
+ do_error("Group descriptor #%llu has an invalid bg_blkno "
+ "of %llu",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(gd->bg_blkno));
+ return -EINVAL;
}
+ if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
+ do_error("Group descriptor #%llu has an invalid "
+ "fs_generation of #%u",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(gd->bg_generation));
+ return -EINVAL;
+ }
+
+ if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
+ do_error("Group descriptor #%llu has bit count %u but "
+ "claims that %u are free",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(gd->bg_bits),
+ le16_to_cpu(gd->bg_free_bits_count));
+ return -EINVAL;
+ }
+
+ if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
+ do_error("Group descriptor #%llu has bit count %u but "
+ "max bitmap bits of %u",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(gd->bg_bits),
+ 8 * le16_to_cpu(gd->bg_size));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ocfs2_validate_gd_parent(struct super_block *sb,
+ struct ocfs2_dinode *di,
+ struct buffer_head *bh,
+ int resize)
+{
+ unsigned int max_bits;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
if (di->i_blkno != gd->bg_parent_dinode) {
- ocfs2_error(sb, "Group descriptor # %llu has bad parent "
- "pointer (%llu, expected %llu)",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
- (unsigned long long)le64_to_cpu(di->i_blkno));
- return -EIO;
+ do_error("Group descriptor #%llu has bad parent "
+ "pointer (%llu, expected %llu)",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ return -EINVAL;
}
max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
if (le16_to_cpu(gd->bg_bits) > max_bits) {
- ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits));
- return -EIO;
+ do_error("Group descriptor #%llu has bit count of %u",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(gd->bg_bits));
+ return -EINVAL;
}
- if (le16_to_cpu(gd->bg_chain) >=
- le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
- ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_chain));
- return -EIO;
+ /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
+ if ((le16_to_cpu(gd->bg_chain) >
+ le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
+ ((le16_to_cpu(gd->bg_chain) ==
+ le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
+ do_error("Group descriptor #%llu has bad chain %u",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(gd->bg_chain));
+ return -EINVAL;
}
- if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
- ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
- "claims that %u are free",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- le16_to_cpu(gd->bg_free_bits_count));
- return -EIO;
- }
+ return 0;
+}
- if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
- ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
- "max bitmap bits of %u",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- 8 * le16_to_cpu(gd->bg_size));
- return -EIO;
+#undef do_error
+
+/*
+ * This version only prints errors. It does not fail the filesystem, and
+ * exists only for resize.
+ */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+ struct ocfs2_dinode *di,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+ if (rc) {
+ mlog(ML_ERROR,
+ "Checksum failed for group descriptor %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ } else
+ rc = ocfs2_validate_gd_self(sb, bh, 1);
+ if (!rc)
+ rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
+
+ return rc;
+}
+
+static int ocfs2_validate_group_descriptor(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+ trace_ocfs2_validate_group_descriptor(
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+ if (rc)
+ return rc;
+
+ /*
+ * Errors after here are fatal.
+ */
+
+ return ocfs2_validate_gd_self(sb, bh, 0);
+}
+
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+ u64 gd_blkno, struct buffer_head **bh)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
+ ocfs2_validate_group_descriptor);
+ if (rc)
+ goto out;
+
+ rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
+ if (rc) {
+ brelse(tmp);
+ goto out;
}
- return 0;
+ /* If ocfs2_read_block() got us a new bh, pass it up. */
+ if (!*bh)
+ *bh = tmp;
+
+out:
+ return rc;
+}
+
+static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
+ struct ocfs2_group_desc *bg,
+ struct ocfs2_chain_list *cl,
+ u64 p_blkno, unsigned int clusters)
+{
+ struct ocfs2_extent_list *el = &bg->bg_list;
+ struct ocfs2_extent_rec *rec;
+
+ BUG_ON(!ocfs2_supports_discontig_bg(osb));
+ if (!el->l_next_free_rec)
+ el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
+ rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
+ rec->e_blkno = cpu_to_le64(p_blkno);
+ rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
+ le16_to_cpu(cl->cl_bpc));
+ rec->e_leaf_clusters = cpu_to_le16(clusters);
+ le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
+ le16_add_cpu(&bg->bg_free_bits_count,
+ clusters * le16_to_cpu(cl->cl_bpc));
+ le16_add_cpu(&el->l_next_free_rec, 1);
}
-static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
+static int ocfs2_block_group_fill(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *bg_bh,
u64 group_blkno,
+ unsigned int group_clusters,
u16 my_chain,
struct ocfs2_chain_list *cl)
{
int status = 0;
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct super_block * sb = alloc_inode->i_sb;
- mlog_entry_void();
-
if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
"b_blocknr (%llu)",
@@ -212,10 +392,10 @@ static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
goto bail;
}
- status = ocfs2_journal_access(handle,
- alloc_inode,
- bg_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_gd(handle,
+ INODE_CACHE(alloc_inode),
+ bg_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -224,19 +404,23 @@ static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
memset(bg, 0, sb->s_blocksize);
strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
- bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
- bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+ bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
+ osb->s_feature_incompat));
bg->bg_chain = cpu_to_le16(my_chain);
bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
bg->bg_blkno = cpu_to_le64(group_blkno);
+ if (group_clusters == le16_to_cpu(cl->cl_cpg))
+ bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+ else
+ ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
+ group_clusters);
+
/* set the 1st bit in the bitmap to account for the descriptor block */
ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
- status = ocfs2_journal_dirty(handle, bg_bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, bg_bh);
/* There is no need to zero out or otherwise initialize the
* other blocks in a group - All valid FS metadata in a block
@@ -244,7 +428,8 @@ static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
* allocation time. */
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -262,107 +447,313 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
return best;
}
-/*
- * We expect the block group allocator to already be locked.
- */
-static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
- struct inode *alloc_inode,
- struct buffer_head *bh)
+static struct buffer_head *
+ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_alloc_context *ac,
+ struct ocfs2_chain_list *cl)
{
- int status, credits;
- struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
- struct ocfs2_chain_list *cl;
- struct ocfs2_alloc_context *ac = NULL;
- struct ocfs2_journal_handle *handle = NULL;
+ int status;
u32 bit_off, num_bits;
- u16 alloc_rec;
u64 bg_blkno;
- struct buffer_head *bg_bh = NULL;
- struct ocfs2_group_desc *bg;
+ struct buffer_head *bg_bh;
+ unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
- BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
+ status = ocfs2_claim_clusters(handle, ac,
+ le16_to_cpu(cl->cl_cpg), &bit_off,
+ &num_bits);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
- mlog_entry_void();
+ /* setup the group */
+ bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ trace_ocfs2_block_group_alloc_contig(
+ (unsigned long long)bg_blkno, alloc_rec);
- handle = ocfs2_alloc_handle(osb);
- if (!handle) {
+ bg_bh = sb_getblk(osb->sb, bg_blkno);
+ if (!bg_bh) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
- cl = &fe->id2.i_chain;
- status = ocfs2_reserve_clusters(osb,
- handle,
- le16_to_cpu(cl->cl_cpg),
- &ac);
+ status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+ bg_blkno, num_bits, alloc_rec, cl);
if (status < 0) {
+ brelse(bg_bh);
+ mlog_errno(status);
+ }
+
+bail:
+ return status ? ERR_PTR(status) : bg_bh;
+}
+
+static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ unsigned int min_bits,
+ u32 *bit_off, u32 *num_bits)
+{
+ int status = 0;
+
+ while (min_bits) {
+ status = ocfs2_claim_clusters(handle, ac, min_bits,
+ bit_off, num_bits);
if (status != -ENOSPC)
- mlog_errno(status);
+ break;
+
+ min_bits >>= 1;
+ }
+
+ return status;
+}
+
+static int ocfs2_block_group_grow_discontig(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *bg_bh,
+ struct ocfs2_alloc_context *ac,
+ struct ocfs2_chain_list *cl,
+ unsigned int min_bits)
+{
+ int status;
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+ struct ocfs2_group_desc *bg =
+ (struct ocfs2_group_desc *)bg_bh->b_data;
+ unsigned int needed = le16_to_cpu(cl->cl_cpg) -
+ le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+ u32 p_cpos, clusters;
+ u64 p_blkno;
+ struct ocfs2_extent_list *el = &bg->bg_list;
+
+ status = ocfs2_journal_access_gd(handle,
+ INODE_CACHE(alloc_inode),
+ bg_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
goto bail;
}
- credits = ocfs2_calc_group_alloc_credits(osb->sb,
- le16_to_cpu(cl->cl_cpg));
- handle = ocfs2_start_trans(osb, handle, credits);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- handle = NULL;
+ while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
+ le16_to_cpu(el->l_count))) {
+ if (min_bits > needed)
+ min_bits = needed;
+ status = ocfs2_block_group_claim_bits(osb, handle, ac,
+ min_bits, &p_cpos,
+ &clusters);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+ p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
+ ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
+ clusters);
+
+ min_bits = clusters;
+ needed = le16_to_cpu(cl->cl_cpg) -
+ le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+ }
+
+ if (needed > 0) {
+ /*
+ * We have used up all the extent rec but can't fill up
+ * the cpg. So bail out.
+ */
+ status = -ENOSPC;
+ goto bail;
+ }
+
+ ocfs2_journal_dirty(handle, bg_bh);
+
+bail:
+ return status;
+}
+
+static void ocfs2_bg_alloc_cleanup(handle_t *handle,
+ struct ocfs2_alloc_context *cluster_ac,
+ struct inode *alloc_inode,
+ struct buffer_head *bg_bh)
+{
+ int i, ret;
+ struct ocfs2_group_desc *bg;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+
+ if (!bg_bh)
+ return;
+
+ bg = (struct ocfs2_group_desc *)bg_bh->b_data;
+ el = &bg->bg_list;
+ for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
+ ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
+ cluster_ac->ac_bh,
+ le64_to_cpu(rec->e_blkno),
+ le16_to_cpu(rec->e_leaf_clusters));
+ if (ret)
+ mlog_errno(ret);
+ /* Try all the clusters to free */
+ }
+
+ ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
+ brelse(bg_bh);
+}
+
+static struct buffer_head *
+ocfs2_block_group_alloc_discontig(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_alloc_context *ac,
+ struct ocfs2_chain_list *cl)
+{
+ int status;
+ u32 bit_off, num_bits;
+ u64 bg_blkno;
+ unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
+ struct buffer_head *bg_bh = NULL;
+ unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+
+ if (!ocfs2_supports_discontig_bg(osb)) {
+ status = -ENOSPC;
+ goto bail;
+ }
+
+ status = ocfs2_extend_trans(handle,
+ ocfs2_calc_bg_discontig_credits(osb->sb));
+ if (status) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_claim_clusters(osb,
- handle,
- ac,
- le16_to_cpu(cl->cl_cpg),
- &bit_off,
- &num_bits);
+ /*
+ * We're going to be grabbing from multiple cluster groups.
+ * We don't have enough credits to relink them all, and the
+ * cluster groups will be staying in cache for the duration of
+ * this operation.
+ */
+ ac->ac_disable_chain_relink = 1;
+
+ /* Claim the first region */
+ status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
+ &bit_off, &num_bits);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
goto bail;
}
-
- alloc_rec = ocfs2_find_smallest_chain(cl);
+ min_bits = num_bits;
/* setup the group */
bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
- mlog(0, "new descriptor, record %u, at block %llu\n",
- alloc_rec, (unsigned long long)bg_blkno);
+ trace_ocfs2_block_group_alloc_discontig(
+ (unsigned long long)bg_blkno, alloc_rec);
bg_bh = sb_getblk(osb->sb, bg_blkno);
if (!bg_bh) {
- status = -EIO;
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
+
+ status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+ bg_blkno, num_bits, alloc_rec, cl);
+ if (status < 0) {
mlog_errno(status);
goto bail;
}
- ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
- status = ocfs2_block_group_fill(handle,
- alloc_inode,
- bg_bh,
- bg_blkno,
- alloc_rec,
- cl);
+ status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
+ bg_bh, ac, cl, min_bits);
+ if (status)
+ mlog_errno(status);
+
+bail:
+ if (status)
+ ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
+ return status ? ERR_PTR(status) : bg_bh;
+}
+
+/*
+ * We expect the block group allocator to already be locked.
+ */
+static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
+ struct inode *alloc_inode,
+ struct buffer_head *bh,
+ u64 max_block,
+ u64 *last_alloc_group,
+ int flags)
+{
+ int status, credits;
+ struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
+ struct ocfs2_chain_list *cl;
+ struct ocfs2_alloc_context *ac = NULL;
+ handle_t *handle = NULL;
+ u16 alloc_rec;
+ struct buffer_head *bg_bh = NULL;
+ struct ocfs2_group_desc *bg;
+
+ BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
+
+ cl = &fe->id2.i_chain;
+ status = ocfs2_reserve_clusters_with_limit(osb,
+ le16_to_cpu(cl->cl_cpg),
+ max_block, flags, &ac);
if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+
+ credits = ocfs2_calc_group_alloc_credits(osb->sb,
+ le16_to_cpu(cl->cl_cpg));
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ handle = NULL;
mlog_errno(status);
goto bail;
}
+ if (last_alloc_group && *last_alloc_group != 0) {
+ trace_ocfs2_block_group_alloc(
+ (unsigned long long)*last_alloc_group);
+ ac->ac_last_group = *last_alloc_group;
+ }
+
+ bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
+ ac, cl);
+ if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
+ bg_bh = ocfs2_block_group_alloc_discontig(handle,
+ alloc_inode,
+ ac, cl);
+ if (IS_ERR(bg_bh)) {
+ status = PTR_ERR(bg_bh);
+ bg_bh = NULL;
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
bg = (struct ocfs2_group_desc *) bg_bh->b_data;
- status = ocfs2_journal_access(handle, alloc_inode,
- bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
+ bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+ alloc_rec = le16_to_cpu(bg->bg_chain);
le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
le16_to_cpu(bg->bg_free_bits_count));
- le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
- cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno);
+ le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
+ le16_to_cpu(bg->bg_bits));
+ cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
le16_add_cpu(&cl->cl_next_free_rec, 1);
@@ -371,11 +762,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, bh);
spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -383,52 +770,69 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
le32_to_cpu(fe->i_clusters)));
spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
- alloc_inode->i_blocks =
- ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
+ alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
+ ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
status = 0;
+
+ /* save the new last alloc group so that the caller can cache it. */
+ if (last_alloc_group)
+ *last_alloc_group = ac->ac_last_group;
+
bail:
if (handle)
- ocfs2_commit_trans(handle);
+ ocfs2_commit_trans(osb, handle);
if (ac)
ocfs2_free_alloc_context(ac);
- if (bg_bh)
- brelse(bg_bh);
+ brelse(bg_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
- struct ocfs2_alloc_context *ac)
+ struct ocfs2_alloc_context *ac,
+ int type,
+ u32 slot,
+ u64 *last_alloc_group,
+ int flags)
{
int status;
u32 bits_wanted = ac->ac_bits_wanted;
- struct inode *alloc_inode = ac->ac_inode;
+ struct inode *alloc_inode;
struct buffer_head *bh = NULL;
- struct ocfs2_journal_handle *handle = ac->ac_handle;
struct ocfs2_dinode *fe;
u32 free_bits;
- mlog_entry_void();
+ alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
+ if (!alloc_inode) {
+ mlog_errno(-EINVAL);
+ return -EINVAL;
+ }
- BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
+ mutex_lock(&alloc_inode->i_mutex);
- ocfs2_handle_add_inode(handle, alloc_inode);
- status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
+ status = ocfs2_inode_lock(alloc_inode, &bh, 1);
if (status < 0) {
+ mutex_unlock(&alloc_inode->i_mutex);
+ iput(alloc_inode);
+
mlog_errno(status);
- goto bail;
+ return status;
}
+ ac->ac_inode = alloc_inode;
+ ac->ac_alloc_slot = slot;
+
fe = (struct ocfs2_dinode *) bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
- status = -EIO;
- goto bail;
- }
+
+ /* The bh was validated by the inode read inside
+ * ocfs2_inode_lock(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
(unsigned long long)le64_to_cpu(fe->i_blkno));
@@ -442,13 +846,22 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
if (bits_wanted > free_bits) {
/* cluster bitmap never grows */
if (ocfs2_is_cluster_bitmap(alloc_inode)) {
- mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
- bits_wanted, free_bits);
+ trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
+ free_bits);
status = -ENOSPC;
goto bail;
}
- status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
+ if (!(flags & ALLOC_NEW_GROUP)) {
+ trace_ocfs2_reserve_suballoc_bits_no_new_group(
+ slot, bits_wanted, free_bits);
+ status = -ENOSPC;
+ goto bail;
+ }
+
+ status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
+ ac->ac_max_block,
+ last_alloc_group, flags);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -465,51 +878,158 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
get_bh(bh);
ac->ac_bh = bh;
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct ocfs2_dinode *fe,
- struct ocfs2_alloc_context **ac)
+static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
+{
+ spin_lock(&osb->osb_lock);
+ osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
+ spin_unlock(&osb->osb_lock);
+ atomic_set(&osb->s_num_inodes_stolen, 0);
+}
+
+static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
+{
+ spin_lock(&osb->osb_lock);
+ osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
+ spin_unlock(&osb->osb_lock);
+ atomic_set(&osb->s_num_meta_stolen, 0);
+}
+
+void ocfs2_init_steal_slots(struct ocfs2_super *osb)
+{
+ ocfs2_init_inode_steal_slot(osb);
+ ocfs2_init_meta_steal_slot(osb);
+}
+
+static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
+{
+ spin_lock(&osb->osb_lock);
+ if (type == INODE_ALLOC_SYSTEM_INODE)
+ osb->s_inode_steal_slot = slot;
+ else if (type == EXTENT_ALLOC_SYSTEM_INODE)
+ osb->s_meta_steal_slot = slot;
+ spin_unlock(&osb->osb_lock);
+}
+
+static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
+{
+ int slot = OCFS2_INVALID_SLOT;
+
+ spin_lock(&osb->osb_lock);
+ if (type == INODE_ALLOC_SYSTEM_INODE)
+ slot = osb->s_inode_steal_slot;
+ else if (type == EXTENT_ALLOC_SYSTEM_INODE)
+ slot = osb->s_meta_steal_slot;
+ spin_unlock(&osb->osb_lock);
+
+ return slot;
+}
+
+static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
+{
+ return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
+{
+ return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_steal_resource(struct ocfs2_super *osb,
+ struct ocfs2_alloc_context *ac,
+ int type)
+{
+ int i, status = -ENOSPC;
+ int slot = __ocfs2_get_steal_slot(osb, type);
+
+ /* Start to steal resource from the first slot after ours. */
+ if (slot == OCFS2_INVALID_SLOT)
+ slot = osb->slot_num + 1;
+
+ for (i = 0; i < osb->max_slots; i++, slot++) {
+ if (slot == osb->max_slots)
+ slot = 0;
+
+ if (slot == osb->slot_num)
+ continue;
+
+ status = ocfs2_reserve_suballoc_bits(osb, ac,
+ type,
+ (u32)slot, NULL,
+ NOT_ALLOC_NEW_GROUP);
+ if (status >= 0) {
+ __ocfs2_set_steal_slot(osb, slot, type);
+ break;
+ }
+
+ ocfs2_free_ac_resource(ac);
+ }
+
+ return status;
+}
+
+static int ocfs2_steal_inode(struct ocfs2_super *osb,
+ struct ocfs2_alloc_context *ac)
+{
+ return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_steal_meta(struct ocfs2_super *osb,
+ struct ocfs2_alloc_context *ac)
+{
+ return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
+}
+
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+ int blocks,
+ struct ocfs2_alloc_context **ac)
{
int status;
- struct inode *alloc_inode = NULL;
+ int slot = ocfs2_get_meta_steal_slot(osb);
- *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+ *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
if (!(*ac)) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
- (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
- (*ac)->ac_handle = handle;
+ (*ac)->ac_bits_wanted = blocks;
(*ac)->ac_which = OCFS2_AC_USE_META;
+ (*ac)->ac_group_search = ocfs2_block_group_search;
-#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
- alloc_inode = ocfs2_get_system_file_inode(osb,
- EXTENT_ALLOC_SYSTEM_INODE,
- 0);
-#else
- alloc_inode = ocfs2_get_system_file_inode(osb,
- EXTENT_ALLOC_SYSTEM_INODE,
- osb->slot_num);
-#endif
- if (!alloc_inode) {
- status = -ENOMEM;
+ if (slot != OCFS2_INVALID_SLOT &&
+ atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
+ goto extent_steal;
+
+ atomic_set(&osb->s_num_meta_stolen, 0);
+ status = ocfs2_reserve_suballoc_bits(osb, (*ac),
+ EXTENT_ALLOC_SYSTEM_INODE,
+ (u32)osb->slot_num, NULL,
+ ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
+
+
+ if (status >= 0) {
+ status = 0;
+ if (slot != OCFS2_INVALID_SLOT)
+ ocfs2_init_meta_steal_slot(osb);
+ goto bail;
+ } else if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
}
- (*ac)->ac_inode = igrab(alloc_inode);
- (*ac)->ac_group_search = ocfs2_block_group_search;
+ ocfs2_free_ac_resource(*ac);
- status = ocfs2_reserve_suballoc_bits(osb, (*ac));
+extent_steal:
+ status = ocfs2_steal_meta(osb, *ac);
+ atomic_inc(&osb->s_num_meta_stolen);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -523,21 +1043,28 @@ bail:
*ac = NULL;
}
- if (alloc_inode)
- iput(alloc_inode);
-
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+ struct ocfs2_extent_list *root_el,
+ struct ocfs2_alloc_context **ac)
+{
+ return ocfs2_reserve_new_metadata_blocks(osb,
+ ocfs2_extend_meta_needed(root_el),
+ ac);
+}
+
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context **ac)
{
int status;
- struct inode *alloc_inode = NULL;
+ int slot = ocfs2_get_inode_steal_slot(osb);
+ u64 alloc_group;
- *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+ *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
if (!(*ac)) {
status = -ENOMEM;
mlog_errno(status);
@@ -545,22 +1072,65 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
}
(*ac)->ac_bits_wanted = 1;
- (*ac)->ac_handle = handle;
(*ac)->ac_which = OCFS2_AC_USE_INODE;
- alloc_inode = ocfs2_get_system_file_inode(osb,
- INODE_ALLOC_SYSTEM_INODE,
- osb->slot_num);
- if (!alloc_inode) {
- status = -ENOMEM;
+ (*ac)->ac_group_search = ocfs2_block_group_search;
+
+ /*
+ * stat(2) can't handle i_ino > 32bits, so we tell the
+ * lower levels not to allocate us a block group past that
+ * limit. The 'inode64' mount option avoids this behavior.
+ */
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
+ (*ac)->ac_max_block = (u32)~0U;
+
+ /*
+ * slot is set when we successfully steal inode from other nodes.
+ * It is reset in 3 places:
+ * 1. when we flush the truncate log
+ * 2. when we complete local alloc recovery.
+ * 3. when we successfully allocate from our own slot.
+ * After it is set, we will go on stealing inodes until we find the
+ * need to check our slots to see whether there is some space for us.
+ */
+ if (slot != OCFS2_INVALID_SLOT &&
+ atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
+ goto inode_steal;
+
+ atomic_set(&osb->s_num_inodes_stolen, 0);
+ alloc_group = osb->osb_inode_alloc_group;
+ status = ocfs2_reserve_suballoc_bits(osb, *ac,
+ INODE_ALLOC_SYSTEM_INODE,
+ (u32)osb->slot_num,
+ &alloc_group,
+ ALLOC_NEW_GROUP |
+ ALLOC_GROUPS_FROM_GLOBAL);
+ if (status >= 0) {
+ status = 0;
+
+ spin_lock(&osb->osb_lock);
+ osb->osb_inode_alloc_group = alloc_group;
+ spin_unlock(&osb->osb_lock);
+ trace_ocfs2_reserve_new_inode_new_group(
+ (unsigned long long)alloc_group);
+
+ /*
+ * Some inodes must be freed by us, so try to allocate
+ * from our own next time.
+ */
+ if (slot != OCFS2_INVALID_SLOT)
+ ocfs2_init_inode_steal_slot(osb);
+ goto bail;
+ } else if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
}
- (*ac)->ac_inode = igrab(alloc_inode);
- (*ac)->ac_group_search = ocfs2_block_group_search;
+ ocfs2_free_ac_resource(*ac);
- status = ocfs2_reserve_suballoc_bits(osb, *ac);
+inode_steal:
+ status = ocfs2_steal_inode(osb, *ac);
+ atomic_inc(&osb->s_num_inodes_stolen);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -574,10 +1144,8 @@ bail:
*ac = NULL;
}
- if (alloc_inode)
- iput(alloc_inode);
-
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -588,20 +1156,18 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
{
int status;
- ac->ac_inode = ocfs2_get_system_file_inode(osb,
- GLOBAL_BITMAP_SYSTEM_INODE,
- OCFS2_INVALID_SLOT);
- if (!ac->ac_inode) {
- status = -EINVAL;
- mlog(ML_ERROR, "Could not get bitmap inode!\n");
- goto bail;
- }
ac->ac_which = OCFS2_AC_USE_MAIN;
ac->ac_group_search = ocfs2_cluster_group_search;
- status = ocfs2_reserve_suballoc_bits(osb, ac);
- if (status < 0 && status != -ENOSPC)
+ status = ocfs2_reserve_suballoc_bits(osb, ac,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT, NULL,
+ ALLOC_NEW_GROUP);
+ if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
+ goto bail;
+ }
+
bail:
return status;
}
@@ -609,18 +1175,14 @@ bail:
/* Callers don't need to care which bitmap (local alloc or main) to
* use so we figure it out for them, but unfortunately this clutters
* things a bit. */
-int ocfs2_reserve_clusters(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- u32 bits_wanted,
- struct ocfs2_alloc_context **ac)
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+ u32 bits_wanted, u64 max_block,
+ int flags,
+ struct ocfs2_alloc_context **ac)
{
int status;
- mlog_entry_void();
-
- BUG_ON(!handle);
-
- *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
+ *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
if (!(*ac)) {
status = -ENOMEM;
mlog_errno(status);
@@ -628,26 +1190,17 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
}
(*ac)->ac_bits_wanted = bits_wanted;
- (*ac)->ac_handle = handle;
+ (*ac)->ac_max_block = max_block;
status = -ENOSPC;
- if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
+ if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
+ ocfs2_alloc_should_use_local(osb, bits_wanted)) {
status = ocfs2_reserve_local_alloc_bits(osb,
- handle,
bits_wanted,
*ac);
if ((status < 0) && (status != -ENOSPC)) {
mlog_errno(status);
goto bail;
- } else if (status == -ENOSPC) {
- /* reserve_local_bits will return enospc with
- * the local alloc inode still locked, so we
- * can change this safely here. */
- mlog(0, "Disabling local alloc\n");
- /* We set to OCFS2_LA_DISABLED so that umount
- * can clean up what's left of the local
- * allocation */
- osb->local_alloc_state = OCFS2_LA_DISABLED;
}
}
@@ -667,10 +1220,19 @@ bail:
*ac = NULL;
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+ u32 bits_wanted,
+ struct ocfs2_alloc_context **ac)
+{
+ return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
+ ALLOC_NEW_GROUP, ac);
+}
+
/*
* More or less lifted from ext3. I'll leave their description below:
*
@@ -690,39 +1252,46 @@ bail:
* sync-data inodes."
*
* Note: OCFS2 already does this differently for metadata vs data
- * allocations, as those bitmaps are seperate and undo access is never
+ * allocations, as those bitmaps are separate and undo access is never
* called on a metadata group descriptor.
*/
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
int nr)
{
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+ int ret;
if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
return 0;
- if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
+
+ if (!buffer_jbd(bg_bh))
return 1;
+ jbd_lock_bh_state(bg_bh);
bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
- return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+ if (bg)
+ ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+ else
+ ret = 1;
+ jbd_unlock_bh_state(bg_bh);
+
+ return ret;
}
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
struct buffer_head *bg_bh,
unsigned int bits_wanted,
unsigned int total_bits,
- u16 *bit_off,
- u16 *bits_found)
+ struct ocfs2_suballoc_result *res)
{
void *bitmap;
u16 best_offset, best_size;
int offset, start, found, status = 0;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
- if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
- return -EIO;
- }
+ /* Callers got this descriptor from
+ * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
found = start = best_offset = best_size = 0;
bitmap = bg->bg_bitmap;
@@ -757,14 +1326,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
}
}
- /* XXX: I think the first clause is equivalent to the second
- * - jlbec */
- if (found == bits_wanted) {
- *bit_off = start - found;
- *bits_found = found;
- } else if (best_size) {
- *bit_off = best_offset;
- *bits_found = best_size;
+ if (best_size) {
+ res->sr_bit_offset = best_offset;
+ res->sr_bits = best_size;
} else {
status = -ENOSPC;
/* No error log here -- see the comment above
@@ -774,7 +1338,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
return status;
}
-static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
+int ocfs2_block_group_set_bits(handle_t *handle,
struct inode *alloc_inode,
struct ocfs2_group_desc *bg,
struct buffer_head *group_bh,
@@ -785,44 +1349,40 @@ static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle
void *bitmap = bg->bg_bitmap;
int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
- mlog_entry_void();
-
- if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
- status = -EIO;
- goto bail;
- }
+ /* All callers get the descriptor via
+ * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
- mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
- num_bits);
+ trace_ocfs2_block_group_set_bits(bit_off, num_bits);
if (ocfs2_is_cluster_bitmap(alloc_inode))
journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
- status = ocfs2_journal_access(handle,
- alloc_inode,
- group_bh,
- journal_type);
+ status = ocfs2_journal_access_gd(handle,
+ INODE_CACHE(alloc_inode),
+ group_bh,
+ journal_type);
if (status < 0) {
mlog_errno(status);
goto bail;
}
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
-
+ if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+ ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+ " count %u but claims %u are freed. num_bits %d",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count), num_bits);
+ return -EROFS;
+ }
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
- status = ocfs2_journal_dirty(handle,
- group_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, group_bh);
bail:
- mlog_exit(status);
return status;
}
@@ -845,7 +1405,7 @@ static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
return best;
}
-static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
+static int ocfs2_relink_block_group(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *fe_bh,
struct buffer_head *bg_bh,
@@ -855,91 +1415,59 @@ static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
int status;
/* there is a really tiny chance the journal calls could fail,
* but we wouldn't want inconsistent blocks in *any* case. */
- u64 fe_ptr, bg_ptr, prev_bg_ptr;
+ u64 bg_ptr, prev_bg_ptr;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
- status = -EIO;
- goto out;
- }
- if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
- status = -EIO;
- goto out;
- }
- if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
- status = -EIO;
- goto out;
- }
+ /* The caller got these descriptors from
+ * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
- mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
- (unsigned long long)fe->i_blkno, chain,
- (unsigned long long)bg->bg_blkno,
- (unsigned long long)prev_bg->bg_blkno);
+ trace_ocfs2_relink_block_group(
+ (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
- fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
bg_ptr = le64_to_cpu(bg->bg_next_group);
prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
- status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+ prev_bg_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0)
+ goto out;
prev_bg->bg_next_group = bg->bg_next_group;
+ ocfs2_journal_dirty(handle, prev_bg_bh);
- status = ocfs2_journal_dirty(handle, prev_bg_bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
-
- status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+ bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0)
+ goto out_rollback_prev_bg;
bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
+ ocfs2_journal_dirty(handle, bg_bh);
- status = ocfs2_journal_dirty(handle, bg_bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
-
- status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
+ fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0)
+ goto out_rollback_bg;
fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
+ ocfs2_journal_dirty(handle, fe_bh);
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
-
- status = 0;
-out_rollback:
- if (status < 0) {
- fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
- bg->bg_next_group = cpu_to_le64(bg_ptr);
- prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
- }
out:
- mlog_exit(status);
+ if (status < 0)
+ mlog_errno(status);
return status;
+
+out_rollback_bg:
+ bg->bg_next_group = cpu_to_le64(bg_ptr);
+out_rollback_prev_bg:
+ prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+ goto out;
}
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
@@ -953,12 +1481,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
- u16 *bit_off, u16 *bits_found)
+ u64 max_block,
+ struct ocfs2_suballoc_result *res)
{
int search = -ENOSPC;
int ret;
+ u64 blkoff;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
- u16 tmp_off, tmp_found;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
unsigned int max_bits, gd_cluster_off;
BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -967,7 +1497,7 @@ static int ocfs2_cluster_group_search(struct inode *inode,
max_bits = le16_to_cpu(gd->bg_bits);
/* Tail groups in cluster bitmaps which aren't cpg
- * aligned are prone to partial extention by a failed
+ * aligned are prone to partial extension by a failed
* fs resize. If the file system resize never got to
* update the dinode cluster count, then we don't want
* to trust any clusters past it, regardless of what
@@ -977,27 +1507,42 @@ static int ocfs2_cluster_group_search(struct inode *inode,
if ((gd_cluster_off + max_bits) >
OCFS2_I(inode)->ip_clusters) {
max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
- mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- OCFS2_I(inode)->ip_clusters, max_bits);
+ trace_ocfs2_cluster_group_search_wrong_max_bits(
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ le16_to_cpu(gd->bg_bits),
+ OCFS2_I(inode)->ip_clusters, max_bits);
}
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
group_bh, bits_wanted,
- max_bits,
- &tmp_off, &tmp_found);
+ max_bits, res);
if (ret)
return ret;
+ if (max_block) {
+ blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
+ gd_cluster_off +
+ res->sr_bit_offset +
+ res->sr_bits);
+ trace_ocfs2_cluster_group_search_max_block(
+ (unsigned long long)blkoff,
+ (unsigned long long)max_block);
+ if (blkoff > max_block)
+ return -ENOSPC;
+ }
+
/* ocfs2_block_group_find_clear_bits() might
* return success, but we still want to return
* -ENOSPC unless it found the minimum number
* of bits. */
- if (min_bits <= tmp_found) {
- *bit_off = tmp_off;
- *bits_found = tmp_found;
+ if (min_bits <= res->sr_bits)
search = 0; /* success */
+ else if (res->sr_bits) {
+ /*
+ * Don't show bits which we'll be returning
+ * for allocation to the local alloc bitmap.
+ */
+ ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
}
}
@@ -1007,25 +1552,37 @@ static int ocfs2_cluster_group_search(struct inode *inode,
static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
- u16 *bit_off, u16 *bits_found)
+ u64 max_block,
+ struct ocfs2_suballoc_result *res)
{
int ret = -ENOSPC;
+ u64 blkoff;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
BUG_ON(min_bits != 1);
BUG_ON(ocfs2_is_cluster_bitmap(inode));
- if (bg->bg_free_bits_count)
+ if (bg->bg_free_bits_count) {
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
group_bh, bits_wanted,
le16_to_cpu(bg->bg_bits),
- bit_off, bits_found);
+ res);
+ if (!ret && max_block) {
+ blkoff = le64_to_cpu(bg->bg_blkno) +
+ res->sr_bit_offset + res->sr_bits;
+ trace_ocfs2_block_group_search_max_block(
+ (unsigned long long)blkoff,
+ (unsigned long long)max_block);
+ if (blkoff > max_block)
+ ret = -ENOSPC;
+ }
+ }
return ret;
}
-static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
- struct ocfs2_journal_handle *handle,
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+ handle_t *handle,
struct buffer_head *di_bh,
u32 num_bits,
u16 chain)
@@ -1035,8 +1592,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1045,56 +1602,119 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
-
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, di_bh);
out:
return ret;
}
+void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain)
+{
+ u32 tmp_used;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+ struct ocfs2_chain_list *cl;
+
+ cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
+ tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
+ di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
+ le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
+}
+
+static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
+ struct ocfs2_extent_rec *rec,
+ struct ocfs2_chain_list *cl)
+{
+ unsigned int bpc = le16_to_cpu(cl->cl_bpc);
+ unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
+ unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
+
+ if (res->sr_bit_offset < bitoff)
+ return 0;
+ if (res->sr_bit_offset >= (bitoff + bitcount))
+ return 0;
+ res->sr_blkno = le64_to_cpu(rec->e_blkno) +
+ (res->sr_bit_offset - bitoff);
+ if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
+ res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
+ return 1;
+}
+
+static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
+ struct ocfs2_group_desc *bg,
+ struct ocfs2_suballoc_result *res)
+{
+ int i;
+ u64 bg_blkno = res->sr_bg_blkno; /* Save off */
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
+ struct ocfs2_chain_list *cl = &di->id2.i_chain;
+
+ if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
+ res->sr_blkno = 0;
+ return;
+ }
+
+ res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
+ res->sr_bg_blkno = 0; /* Clear it for contig block groups */
+ if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
+ !bg->bg_list.l_next_free_rec)
+ return;
+
+ for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
+ rec = &bg->bg_list.l_recs[i];
+ if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
+ res->sr_bg_blkno = bg_blkno; /* Restore */
+ break;
+ }
+ }
+}
+
static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
+ handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 gd_blkno,
+ struct ocfs2_suballoc_result *res,
u16 *bits_left)
{
int ret;
- u16 found;
struct buffer_head *group_bh = NULL;
struct ocfs2_group_desc *gd;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
struct inode *alloc_inode = ac->ac_inode;
- struct ocfs2_journal_handle *handle = ac->ac_handle;
- ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
- &group_bh, OCFS2_BH_CACHED, alloc_inode);
+ ret = ocfs2_read_group_descriptor(alloc_inode, di,
+ res->sr_bg_blkno, &group_bh);
if (ret < 0) {
mlog_errno(ret);
return ret;
}
gd = (struct ocfs2_group_desc *) group_bh->b_data;
- if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
- ret = -EIO;
- goto out;
- }
-
ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
- bit_off, &found);
+ ac->ac_max_block, res);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
goto out;
}
- *num_bits = found;
+ if (!ret)
+ ocfs2_bg_discontig_fix_result(ac, gd, res);
+
+ /*
+ * sr_bg_blkno might have been changed by
+ * ocfs2_bg_discontig_fix_result
+ */
+ res->sr_bg_stable_blkno = group_bh->b_blocknr;
+
+ if (ac->ac_find_loc_only)
+ goto out_loc_only;
ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
- *num_bits,
+ res->sr_bits,
le16_to_cpu(gd->bg_chain));
if (ret < 0) {
mlog_errno(ret);
@@ -1102,10 +1722,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
}
ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
- *bit_off, *num_bits);
- if (ret < 0)
+ res->sr_bit_offset, res->sr_bits);
+ if (ret < 0) {
+ ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
+ res->sr_bits,
+ le16_to_cpu(gd->bg_chain));
mlog_errno(ret);
+ }
+out_loc_only:
*bits_left = le16_to_cpu(gd->bg_free_bits_count);
out:
@@ -1115,18 +1740,15 @@ out:
}
static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
+ handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 *bg_blkno,
+ struct ocfs2_suballoc_result *res,
u16 *bits_left)
{
int status;
- u16 chain, tmp_bits;
- u32 tmp_used;
+ u16 chain;
u64 next_group;
- struct ocfs2_journal_handle *handle = ac->ac_handle;
struct inode *alloc_inode = ac->ac_inode;
struct buffer_head *group_bh = NULL;
struct buffer_head *prev_group_bh = NULL;
@@ -1135,53 +1757,42 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
struct ocfs2_group_desc *bg;
chain = ac->ac_chain;
- mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
- bits_wanted, chain,
- (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
+ trace_ocfs2_search_chain_begin(
+ (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
+ bits_wanted, chain);
- status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
- le64_to_cpu(cl->cl_recs[chain].c_blkno),
- &group_bh, OCFS2_BH_CACHED, alloc_inode);
+ status = ocfs2_read_group_descriptor(alloc_inode, fe,
+ le64_to_cpu(cl->cl_recs[chain].c_blkno),
+ &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
bg = (struct ocfs2_group_desc *) group_bh->b_data;
- status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
status = -ENOSPC;
/* for now, the chain search is a bit simplistic. We just use
* the 1st group with any empty bits. */
while ((status = ac->ac_group_search(alloc_inode, group_bh,
- bits_wanted, min_bits, bit_off,
- &tmp_bits)) == -ENOSPC) {
+ bits_wanted, min_bits,
+ ac->ac_max_block,
+ res)) == -ENOSPC) {
if (!bg->bg_next_group)
break;
- if (prev_group_bh) {
- brelse(prev_group_bh);
- prev_group_bh = NULL;
- }
+ brelse(prev_group_bh);
+ prev_group_bh = NULL;
+
next_group = le64_to_cpu(bg->bg_next_group);
prev_group_bh = group_bh;
group_bh = NULL;
- status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
- next_group, &group_bh,
- OCFS2_BH_CACHED, alloc_inode);
+ status = ocfs2_read_group_descriptor(alloc_inode, fe,
+ next_group, &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
bg = (struct ocfs2_group_desc *) group_bh->b_data;
- status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
}
if (status < 0) {
if (status != -ENOSPC)
@@ -1189,12 +1800,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
goto bail;
}
- mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
- tmp_bits, (unsigned long long)bg->bg_blkno);
+ trace_ocfs2_search_chain_succ(
+ (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
+
+ res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
- *num_bits = tmp_bits;
+ BUG_ON(res->sr_bits == 0);
+ if (!status)
+ ocfs2_bg_discontig_fix_result(ac, bg, res);
- BUG_ON(*num_bits == 0);
+ /*
+ * sr_bg_blkno might have been changed by
+ * ocfs2_bg_discontig_fix_result
+ */
+ res->sr_bg_stable_blkno = group_bh->b_blocknr;
/*
* Keep track of previous block descriptor read. When
@@ -1209,9 +1828,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
* Do this *after* figuring out how many bits we're taking out
* of our target group.
*/
- if (ac->ac_allow_chain_relink &&
+ if (!ac->ac_disable_chain_relink &&
(prev_group_bh) &&
- (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
+ (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
status = ocfs2_relink_block_group(handle, alloc_inode,
ac->ac_bh, group_bh,
prev_group_bh, chain);
@@ -1221,24 +1840,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
}
}
- /* Ok, claim our bits now: set the info on dinode, chainlist
- * and then the group */
- status = ocfs2_journal_access(handle,
- alloc_inode,
- ac->ac_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
- fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
- le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
+ if (ac->ac_find_loc_only)
+ goto out_loc_only;
- status = ocfs2_journal_dirty(handle,
- ac->ac_bh);
- if (status < 0) {
+ status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
+ ac->ac_bh, res->sr_bits,
+ chain);
+ if (status) {
mlog_errno(status);
goto bail;
}
@@ -1247,59 +1855,58 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
alloc_inode,
bg,
group_bh,
- *bit_off,
- *num_bits);
+ res->sr_bit_offset,
+ res->sr_bits);
if (status < 0) {
+ ocfs2_rollback_alloc_dinode_counts(alloc_inode,
+ ac->ac_bh, res->sr_bits, chain);
mlog_errno(status);
goto bail;
}
- mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
- (unsigned long long)fe->i_blkno);
+ trace_ocfs2_search_chain_end(
+ (unsigned long long)le64_to_cpu(fe->i_blkno),
+ res->sr_bits);
- *bg_blkno = le64_to_cpu(bg->bg_blkno);
+out_loc_only:
*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
- if (group_bh)
- brelse(group_bh);
- if (prev_group_bh)
- brelse(prev_group_bh);
+ brelse(group_bh);
+ brelse(prev_group_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
/* will give out up to bits_wanted contiguous bits. */
-static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
- struct ocfs2_alloc_context *ac,
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
+ handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 *bg_blkno)
+ struct ocfs2_suballoc_result *res)
{
int status;
u16 victim, i;
u16 bits_left = 0;
- u64 hint_blkno = ac->ac_last_group;
+ u64 hint = ac->ac_last_group;
struct ocfs2_chain_list *cl;
struct ocfs2_dinode *fe;
- mlog_entry_void();
-
BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
BUG_ON(!ac->ac_bh);
fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
- status = -EIO;
- goto bail;
- }
+
+ /* The bh was validated by the inode read during
+ * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
le32_to_cpu(fe->id1.bitmap1.i_total)) {
- ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
+ ocfs2_error(ac->ac_inode->i_sb,
+ "Chain allocator dinode %llu has %u used "
"bits but only %u total.",
(unsigned long long)le64_to_cpu(fe->i_blkno),
le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1308,22 +1915,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
goto bail;
}
- if (hint_blkno) {
+ res->sr_bg_blkno = hint;
+ if (res->sr_bg_blkno) {
/* Attempt to short-circuit the usual search mechanism
* by jumping straight to the most recently used
- * allocation group. This helps us mantain some
+ * allocation group. This helps us maintain some
* contiguousness across allocations. */
- status = ocfs2_search_one_group(ac, bits_wanted, min_bits,
- bit_off, num_bits,
- hint_blkno, &bits_left);
- if (!status) {
- /* Be careful to update *bg_blkno here as the
- * caller is expecting it to be filled in, and
- * ocfs2_search_one_group() won't do that for
- * us. */
- *bg_blkno = hint_blkno;
+ status = ocfs2_search_one_group(ac, handle, bits_wanted,
+ min_bits, res, &bits_left);
+ if (!status)
goto set_hint;
- }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
@@ -1334,25 +1935,25 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
victim = ocfs2_find_victim_chain(cl);
ac->ac_chain = victim;
- ac->ac_allow_chain_relink = 1;
- status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
- num_bits, bg_blkno, &bits_left);
- if (!status)
+ status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
+ res, &bits_left);
+ if (!status) {
+ hint = ocfs2_group_from_res(res);
goto set_hint;
+ }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
}
- mlog(0, "Search of victim chain %u came up with nothing, "
- "trying all chains now.\n", victim);
+ trace_ocfs2_claim_suballoc_bits(victim);
/* If we didn't pick a good victim, then just default to
* searching each chain in order. Don't allow chain relinking
* because we only calculate enough journal credits for one
* relink per alloc. */
- ac->ac_allow_chain_relink = 0;
+ ac->ac_disable_chain_relink = 1;
for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
if (i == victim)
continue;
@@ -1360,11 +1961,12 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
continue;
ac->ac_chain = i;
- status = ocfs2_search_chain(ac, bits_wanted, min_bits,
- bit_off, num_bits, bg_blkno,
- &bits_left);
- if (!status)
+ status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
+ res, &bits_left);
+ if (!status) {
+ hint = ocfs2_group_from_res(res);
break;
+ }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
@@ -1379,89 +1981,260 @@ set_hint:
if (bits_left < min_bits)
ac->ac_last_group = 0;
else
- ac->ac_last_group = *bg_blkno;
+ ac->ac_last_group = hint;
}
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-int ocfs2_claim_metadata(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+int ocfs2_claim_metadata(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
+ u64 *suballoc_loc,
u16 *suballoc_bit_start,
unsigned int *num_bits,
u64 *blkno_start)
{
int status;
- u64 bg_blkno;
+ struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
BUG_ON(!ac);
BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
- BUG_ON(ac->ac_handle != handle);
- status = ocfs2_claim_suballoc_bits(osb,
- ac,
+ status = ocfs2_claim_suballoc_bits(ac,
+ handle,
bits_wanted,
1,
- suballoc_bit_start,
- num_bits,
- &bg_blkno);
+ &res);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- atomic_inc(&osb->alloc_stats.bg_allocs);
+ atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
- *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
- ac->ac_bits_given += (*num_bits);
+ *suballoc_loc = res.sr_bg_blkno;
+ *suballoc_bit_start = res.sr_bit_offset;
+ *blkno_start = res.sr_blkno;
+ ac->ac_bits_given += res.sr_bits;
+ *num_bits = res.sr_bits;
status = 0;
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-int ocfs2_claim_new_inode(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+static void ocfs2_init_inode_ac_group(struct inode *dir,
+ struct buffer_head *parent_di_bh,
+ struct ocfs2_alloc_context *ac)
+{
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
+ /*
+ * Try to allocate inodes from some specific group.
+ *
+ * If the parent dir has recorded the last group used in allocation,
+ * cool, use it. Otherwise if we try to allocate new inode from the
+ * same slot the parent dir belongs to, use the same chunk.
+ *
+ * We are very careful here to avoid the mistake of setting
+ * ac_last_group to a group descriptor from a different (unlocked) slot.
+ */
+ if (OCFS2_I(dir)->ip_last_used_group &&
+ OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
+ ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
+ else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
+ if (di->i_suballoc_loc)
+ ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
+ else
+ ac->ac_last_group = ocfs2_which_suballoc_group(
+ le64_to_cpu(di->i_blkno),
+ le16_to_cpu(di->i_suballoc_bit));
+ }
+}
+
+static inline void ocfs2_save_inode_ac_group(struct inode *dir,
+ struct ocfs2_alloc_context *ac)
+{
+ OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
+ OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
+}
+
+int ocfs2_find_new_inode_loc(struct inode *dir,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_alloc_context *ac,
+ u64 *fe_blkno)
+{
+ int ret;
+ handle_t *handle = NULL;
+ struct ocfs2_suballoc_result *res;
+
+ BUG_ON(!ac);
+ BUG_ON(ac->ac_bits_given != 0);
+ BUG_ON(ac->ac_bits_wanted != 1);
+ BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
+
+ res = kzalloc(sizeof(*res), GFP_NOFS);
+ if (res == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
+
+ /*
+ * The handle started here is for chain relink. Alternatively,
+ * we could just disable relink for these calls.
+ */
+ handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * This will instruct ocfs2_claim_suballoc_bits and
+ * ocfs2_search_one_group to search but save actual allocation
+ * for later.
+ */
+ ac->ac_find_loc_only = 1;
+
+ ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ac->ac_find_loc_priv = res;
+ *fe_blkno = res->sr_blkno;
+ ocfs2_update_inode_fsync_trans(handle, dir, 0);
+out:
+ if (handle)
+ ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
+
+ if (ret)
+ kfree(res);
+
+ return ret;
+}
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
+ u16 *suballoc_bit,
+ u64 di_blkno)
+{
+ int ret;
+ u16 chain;
+ struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
+ struct buffer_head *bg_bh = NULL;
+ struct ocfs2_group_desc *bg;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+
+ /*
+ * Since di_blkno is being passed back in, we check for any
+ * inconsistencies which may have happened between
+ * calls. These are code bugs as di_blkno is not expected to
+ * change once returned from ocfs2_find_new_inode_loc()
+ */
+ BUG_ON(res->sr_blkno != di_blkno);
+
+ ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
+ res->sr_bg_stable_blkno, &bg_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+ chain = le16_to_cpu(bg->bg_chain);
+
+ ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
+ ac->ac_bh, res->sr_bits,
+ chain);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_block_group_set_bits(handle,
+ ac->ac_inode,
+ bg,
+ bg_bh,
+ res->sr_bit_offset,
+ res->sr_bits);
+ if (ret < 0) {
+ ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
+ ac->ac_bh, res->sr_bits, chain);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
+ res->sr_bits);
+
+ atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
+
+ BUG_ON(res->sr_bits != 1);
+
+ *suballoc_loc = res->sr_bg_blkno;
+ *suballoc_bit = res->sr_bit_offset;
+ ac->ac_bits_given++;
+ ocfs2_save_inode_ac_group(dir, ac);
+
+out:
+ brelse(bg_bh);
+
+ return ret;
+}
+
+int ocfs2_claim_new_inode(handle_t *handle,
+ struct inode *dir,
+ struct buffer_head *parent_fe_bh,
struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
u16 *suballoc_bit,
u64 *fe_blkno)
{
int status;
- unsigned int num_bits;
- u64 bg_blkno;
-
- mlog_entry_void();
+ struct ocfs2_suballoc_result res;
BUG_ON(!ac);
BUG_ON(ac->ac_bits_given != 0);
BUG_ON(ac->ac_bits_wanted != 1);
BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
- BUG_ON(ac->ac_handle != handle);
- status = ocfs2_claim_suballoc_bits(osb,
- ac,
+ ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
+
+ status = ocfs2_claim_suballoc_bits(ac,
+ handle,
1,
1,
- suballoc_bit,
- &num_bits,
- &bg_blkno);
+ &res);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- atomic_inc(&osb->alloc_stats.bg_allocs);
+ atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
- BUG_ON(num_bits != 1);
+ BUG_ON(res.sr_bits != 1);
- *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
+ *suballoc_loc = res.sr_bg_blkno;
+ *suballoc_bit = res.sr_bit_offset;
+ *fe_blkno = res.sr_blkno;
ac->ac_bits_given++;
+ ocfs2_save_inode_ac_group(dir, ac);
status = 0;
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1484,8 +2257,7 @@ static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
/* given a cluster offset, calculate which block group it belongs to
* and return that block offset. */
-static inline u64 ocfs2_which_cluster_group(struct inode *inode,
- u32 cluster)
+u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u32 group_no;
@@ -1527,28 +2299,26 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
* contig. allocation, set to '1' to indicate we can deal with extents
* of any size.
*/
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct ocfs2_alloc_context *ac,
- u32 min_clusters,
- u32 *cluster_start,
- u32 *num_clusters)
+int __ocfs2_claim_clusters(handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 min_clusters,
+ u32 max_clusters,
+ u32 *cluster_start,
+ u32 *num_clusters)
{
int status;
- unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
- u64 bg_blkno = 0;
- u16 bg_bit_off;
+ unsigned int bits_wanted = max_clusters;
+ struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
+ struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
- mlog_entry_void();
-
- BUG_ON(!ac);
BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
&& ac->ac_which != OCFS2_AC_USE_MAIN);
- BUG_ON(ac->ac_handle != handle);
if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
+ WARN_ON(min_clusters > 1);
+
status = ocfs2_claim_local_alloc_bits(osb,
handle,
ac,
@@ -1561,8 +2331,9 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
if (min_clusters > (osb->bitmap_cpg - 1)) {
/* The only paths asking for contiguousness
* should know about this already. */
- mlog(ML_ERROR, "minimum allocation requested exceeds "
- "group bitmap size!");
+ mlog(ML_ERROR, "minimum allocation requested %u exceeds "
+ "group bitmap size %u!\n", min_clusters,
+ osb->bitmap_cpg);
status = -ENOSPC;
goto bail;
}
@@ -1570,19 +2341,19 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
if (bits_wanted > (osb->bitmap_cpg - 1))
bits_wanted = osb->bitmap_cpg - 1;
- status = ocfs2_claim_suballoc_bits(osb,
- ac,
+ status = ocfs2_claim_suballoc_bits(ac,
+ handle,
bits_wanted,
min_clusters,
- &bg_bit_off,
- num_clusters,
- &bg_blkno);
+ &res);
if (!status) {
+ BUG_ON(res.sr_blkno); /* cluster alloc can't set */
*cluster_start =
ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
- bg_blkno,
- bg_bit_off);
+ res.sr_bg_blkno,
+ res.sr_bit_offset);
atomic_inc(&osb->alloc_stats.bitmap_data);
+ *num_clusters = res.sr_bits;
}
}
if (status < 0) {
@@ -1594,58 +2365,82 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
ac->ac_bits_given += *num_clusters;
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits)
+int ocfs2_claim_clusters(handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 min_clusters,
+ u32 *cluster_start,
+ u32 *num_clusters)
+{
+ unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+
+ return __ocfs2_claim_clusters(handle, ac, min_clusters,
+ bits_wanted, cluster_start, num_clusters);
+}
+
+static int ocfs2_block_group_clear_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_group_desc *bg,
+ struct buffer_head *group_bh,
+ unsigned int bit_off,
+ unsigned int num_bits,
+ void (*undo_fn)(unsigned int bit,
+ unsigned long *bmap))
{
int status;
unsigned int tmp;
- int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
struct ocfs2_group_desc *undo_bg = NULL;
- mlog_entry_void();
-
- if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
- status = -EIO;
- goto bail;
- }
-
- mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
+ /* The caller got this descriptor from
+ * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
- if (ocfs2_is_cluster_bitmap(alloc_inode))
- journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+ trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
- status = ocfs2_journal_access(handle, alloc_inode, group_bh,
- journal_type);
+ BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
+ status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+ group_bh,
+ undo_fn ?
+ OCFS2_JOURNAL_ACCESS_UNDO :
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- if (ocfs2_is_cluster_bitmap(alloc_inode))
- undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
+ if (undo_fn) {
+ jbd_lock_bh_state(group_bh);
+ undo_bg = (struct ocfs2_group_desc *)
+ bh2jh(group_bh)->b_committed_data;
+ BUG_ON(!undo_bg);
+ }
tmp = num_bits;
while(tmp--) {
ocfs2_clear_bit((bit_off + tmp),
(unsigned long *) bg->bg_bitmap);
- if (ocfs2_is_cluster_bitmap(alloc_inode))
- ocfs2_set_bit(bit_off + tmp,
- (unsigned long *) undo_bg->bg_bitmap);
+ if (undo_fn)
+ undo_fn(bit_off + tmp,
+ (unsigned long *) undo_bg->bg_bitmap);
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+ if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+ ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+ " count %u but claims %u are freed. num_bits %d",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count), num_bits);
+ return -EROFS;
+ }
- status = ocfs2_journal_dirty(handle, group_bh);
- if (status < 0)
- mlog_errno(status);
+ if (undo_fn)
+ jbd_unlock_bh_state(group_bh);
+
+ ocfs2_journal_dirty(handle, group_bh);
bail:
return status;
}
@@ -1653,59 +2448,55 @@ bail:
/*
* expects the suballoc inode to already be locked.
*/
-static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
- struct inode *alloc_inode,
- struct buffer_head *alloc_bh,
- unsigned int start_bit,
- u64 bg_blkno,
- unsigned int count)
+static int _ocfs2_free_suballoc_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *alloc_bh,
+ unsigned int start_bit,
+ u64 bg_blkno,
+ unsigned int count,
+ void (*undo_fn)(unsigned int bit,
+ unsigned long *bitmap))
{
int status = 0;
u32 tmp_used;
- struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
struct ocfs2_chain_list *cl = &fe->id2.i_chain;
struct buffer_head *group_bh = NULL;
struct ocfs2_group_desc *group;
- mlog_entry_void();
-
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
- status = -EIO;
- goto bail;
- }
+ /* The alloc_bh comes from ocfs2_free_dinode() or
+ * ocfs2_free_clusters(). The callers have all locked the
+ * allocator and gotten alloc_bh from the lock call. This
+ * validates the dinode buffer. Any corruption that has happened
+ * is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
- mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
- (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
- (unsigned long long)bg_blkno, start_bit);
+ trace_ocfs2_free_suballoc_bits(
+ (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
+ (unsigned long long)bg_blkno,
+ start_bit, count);
- status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
- alloc_inode);
+ status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
+ &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
-
group = (struct ocfs2_group_desc *) group_bh->b_data;
- status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
+
BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
status = ocfs2_block_group_clear_bits(handle, alloc_inode,
group, group_bh,
- start_bit, count);
+ start_bit, count, undo_fn);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
+ alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1715,29 +2506,28 @@ static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
count);
tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
-
- status = ocfs2_journal_dirty(handle, alloc_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, alloc_bh);
bail:
- if (group_bh)
- brelse(group_bh);
+ brelse(group_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
+int ocfs2_free_suballoc_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *alloc_bh,
+ unsigned int start_bit,
+ u64 bg_blkno,
+ unsigned int count)
{
- u64 group = block - (u64) bit;
-
- return group;
+ return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
+ start_bit, bg_blkno, count, NULL);
}
-int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
+int ocfs2_free_dinode(handle_t *handle,
struct inode *inode_alloc_inode,
struct buffer_head *inode_alloc_bh,
struct ocfs2_dinode *di)
@@ -1746,28 +2536,19 @@ int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
u16 bit = le16_to_cpu(di->i_suballoc_bit);
u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+ if (di->i_suballoc_loc)
+ bg_blkno = le64_to_cpu(di->i_suballoc_loc);
return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
inode_alloc_bh, bit, bg_blkno, 1);
}
-int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
- struct inode *eb_alloc_inode,
- struct buffer_head *eb_alloc_bh,
- struct ocfs2_extent_block *eb)
-{
- u64 blk = le64_to_cpu(eb->h_blkno);
- u16 bit = le16_to_cpu(eb->h_suballoc_bit);
- u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
-
- return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
- bit, bg_blkno, 1);
-}
-
-int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
- struct inode *bitmap_inode,
- struct buffer_head *bitmap_bh,
- u64 start_blk,
- unsigned int num_clusters)
+static int _ocfs2_free_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters,
+ void (*undo_fn)(unsigned int bit,
+ unsigned long *bitmap))
{
int status;
u16 bg_start_bit;
@@ -1776,11 +2557,8 @@ int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
/* You can't ever have a contiguous set of clusters
* bigger than a block group bitmap so we never have to worry
- * about looping on them. */
-
- mlog_entry_void();
-
- /* This is expensive. We can safely remove once this stuff has
+ * about looping on them.
+ * This is expensive. We can safely remove once this stuff has
* gotten tested really well. */
BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
@@ -1789,21 +2567,53 @@ int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
&bg_start_bit);
- mlog(0, "want to free %u clusters starting at block %llu\n",
- num_clusters, (unsigned long long)start_blk);
- mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
- (unsigned long long)bg_blkno, bg_start_bit);
+ trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
+ (unsigned long long)start_blk,
+ bg_start_bit, num_clusters);
- status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
- bg_start_bit, bg_blkno,
- num_clusters);
- if (status < 0)
+ status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
+ bg_start_bit, bg_blkno,
+ num_clusters, undo_fn);
+ if (status < 0) {
mlog_errno(status);
+ goto out;
+ }
+
+ ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
+ num_clusters);
- mlog_exit(status);
+out:
+ if (status)
+ mlog_errno(status);
return status;
}
+int ocfs2_free_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters)
+{
+ return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+ start_blk, num_clusters,
+ _ocfs2_set_bit);
+}
+
+/*
+ * Give never-used clusters back to the global bitmap. We don't need
+ * to protect these bits in the undo buffer.
+ */
+int ocfs2_release_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters)
+{
+ return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+ start_blk, num_clusters,
+ _ocfs2_clear_bit);
+}
+
static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
{
printk("Block Group:\n");
@@ -1850,3 +2660,258 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
(unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
}
}
+
+/*
+ * For a given allocation, determine which allocators will need to be
+ * accessed, and lock them, reserving the appropriate number of bits.
+ *
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
+ */
+int ocfs2_lock_allocators(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters_to_add, u32 extents_to_split,
+ struct ocfs2_alloc_context **data_ac,
+ struct ocfs2_alloc_context **meta_ac)
+{
+ int ret = 0, num_free_extents;
+ unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ *meta_ac = NULL;
+ if (data_ac)
+ *data_ac = NULL;
+
+ BUG_ON(clusters_to_add != 0 && data_ac == NULL);
+
+ num_free_extents = ocfs2_num_free_extents(osb, et);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Sparse allocation file systems need to be more conservative
+ * with reserving room for expansion - the actual allocation
+ * happens while we've got a journal handle open so re-taking
+ * a cluster lock (because we ran out of room for another
+ * extent) will violate ordering rules.
+ *
+ * Most of the time we'll only be seeing this 1 cluster at a time
+ * anyway.
+ *
+ * Always lock for any unwritten extents - we might want to
+ * add blocks during a split.
+ */
+ if (!num_free_extents ||
+ (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
+ ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (clusters_to_add == 0)
+ goto out;
+
+ ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ret) {
+ if (*meta_ac) {
+ ocfs2_free_alloc_context(*meta_ac);
+ *meta_ac = NULL;
+ }
+
+ /*
+ * We cannot have an error and a non null *data_ac.
+ */
+ }
+
+ return ret;
+}
+
+/*
+ * Read the inode specified by blkno to get suballoc_slot and
+ * suballoc_bit.
+ */
+static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
+ u16 *suballoc_slot, u64 *group_blkno,
+ u16 *suballoc_bit)
+{
+ int status;
+ struct buffer_head *inode_bh = NULL;
+ struct ocfs2_dinode *inode_fe;
+
+ trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
+
+ /* dirty read disk */
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
+ if (status < 0) {
+ mlog(ML_ERROR, "read block %llu failed %d\n",
+ (unsigned long long)blkno, status);
+ goto bail;
+ }
+
+ inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
+ if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
+ mlog(ML_ERROR, "invalid inode %llu requested\n",
+ (unsigned long long)blkno);
+ status = -EINVAL;
+ goto bail;
+ }
+
+ if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
+ (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
+ mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
+ (unsigned long long)blkno,
+ (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
+ status = -EINVAL;
+ goto bail;
+ }
+
+ if (suballoc_slot)
+ *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
+ if (suballoc_bit)
+ *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
+ if (group_blkno)
+ *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
+
+bail:
+ brelse(inode_bh);
+
+ if (status)
+ mlog_errno(status);
+ return status;
+}
+
+/*
+ * test whether bit is SET in allocator bitmap or not. on success, 0
+ * is returned and *res is 1 for SET; 0 otherwise. when fails, errno
+ * is returned and *res is meaningless. Call this after you have
+ * cluster locked against suballoc, or you may get a result based on
+ * non-up2date contents
+ */
+static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
+ struct inode *suballoc,
+ struct buffer_head *alloc_bh,
+ u64 group_blkno, u64 blkno,
+ u16 bit, int *res)
+{
+ struct ocfs2_dinode *alloc_di;
+ struct ocfs2_group_desc *group;
+ struct buffer_head *group_bh = NULL;
+ u64 bg_blkno;
+ int status;
+
+ trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
+ (unsigned int)bit);
+
+ alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
+ if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
+ mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
+ (unsigned int)bit,
+ ocfs2_bits_per_group(&alloc_di->id2.i_chain));
+ status = -EINVAL;
+ goto bail;
+ }
+
+ bg_blkno = group_blkno ? group_blkno :
+ ocfs2_which_suballoc_group(blkno, bit);
+ status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
+ &group_bh);
+ if (status < 0) {
+ mlog(ML_ERROR, "read group %llu failed %d\n",
+ (unsigned long long)bg_blkno, status);
+ goto bail;
+ }
+
+ group = (struct ocfs2_group_desc *) group_bh->b_data;
+ *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
+
+bail:
+ brelse(group_bh);
+
+ if (status)
+ mlog_errno(status);
+ return status;
+}
+
+/*
+ * Test if the bit representing this inode (blkno) is set in the
+ * suballocator.
+ *
+ * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
+ *
+ * In the event of failure, a negative value is returned and *res is
+ * meaningless.
+ *
+ * Callers must make sure to hold nfs_sync_lock to prevent
+ * ocfs2_delete_inode() on another node from accessing the same
+ * suballocator concurrently.
+ */
+int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
+{
+ int status;
+ u64 group_blkno = 0;
+ u16 suballoc_bit = 0, suballoc_slot = 0;
+ struct inode *inode_alloc_inode;
+ struct buffer_head *alloc_bh = NULL;
+
+ trace_ocfs2_test_inode_bit((unsigned long long)blkno);
+
+ status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
+ &group_blkno, &suballoc_bit);
+ if (status < 0) {
+ mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
+ goto bail;
+ }
+
+ inode_alloc_inode =
+ ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
+ suballoc_slot);
+ if (!inode_alloc_inode) {
+ /* the error code could be inaccurate, but we are not able to
+ * get the correct one. */
+ status = -EINVAL;
+ mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
+ (u32)suballoc_slot);
+ goto bail;
+ }
+
+ mutex_lock(&inode_alloc_inode->i_mutex);
+ status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
+ if (status < 0) {
+ mutex_unlock(&inode_alloc_inode->i_mutex);
+ iput(inode_alloc_inode);
+ mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
+ (u32)suballoc_slot, status);
+ goto bail;
+ }
+
+ status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
+ group_blkno, blkno, suballoc_bit, res);
+ if (status < 0)
+ mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
+
+ ocfs2_inode_unlock(inode_alloc_inode, 0);
+ mutex_unlock(&inode_alloc_inode->i_mutex);
+
+ iput(inode_alloc_inode);
+ brelse(alloc_bh);
+bail:
+ if (status)
+ mlog_errno(status);
+ return status;
+}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index c787838d105..2d2501767c0 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -26,16 +26,19 @@
#ifndef _CHAINALLOC_H_
#define _CHAINALLOC_H_
+struct ocfs2_suballoc_result;
typedef int (group_search_t)(struct inode *,
struct buffer_head *,
- u32,
- u32,
- u16 *,
- u16 *);
+ u32, /* bits_wanted */
+ u32, /* min_bits */
+ u64, /* max_block */
+ struct ocfs2_suballoc_result *);
+ /* found bits */
struct ocfs2_alloc_context {
struct inode *ac_inode; /* which bitmap are we allocating from? */
struct buffer_head *ac_bh; /* file entry bh */
+ u32 ac_alloc_slot; /* which slot are we allocating from? */
u32 ac_bits_wanted;
u32 ac_bits_given;
#define OCFS2_AC_USE_LOCAL 1
@@ -43,66 +46,119 @@ struct ocfs2_alloc_context {
#define OCFS2_AC_USE_INODE 3
#define OCFS2_AC_USE_META 4
u32 ac_which;
- struct ocfs2_journal_handle *ac_handle;
/* these are used by the chain search */
u16 ac_chain;
- int ac_allow_chain_relink;
+ int ac_disable_chain_relink;
group_search_t *ac_group_search;
u64 ac_last_group;
+ u64 ac_max_block; /* Highest block number to allocate. 0 is
+ is the same as ~0 - unlimited */
+
+ int ac_find_loc_only; /* hack for reflink operation ordering */
+ struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
+
+ struct ocfs2_alloc_reservation *ac_resv;
};
+void ocfs2_init_steal_slots(struct ocfs2_super *osb);
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
{
return ac->ac_bits_wanted - ac->ac_bits_given;
}
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
- struct ocfs2_dinode *fe,
+ struct ocfs2_extent_list *root_el,
struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+ int blocks,
+ struct ocfs2_alloc_context **ac);
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
struct ocfs2_alloc_context **ac);
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
u32 bits_wanted,
struct ocfs2_alloc_context **ac);
-int ocfs2_claim_metadata(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain);
+void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain);
+int ocfs2_block_group_set_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_group_desc *bg,
+ struct buffer_head *group_bh,
+ unsigned int bit_off,
+ unsigned int num_bits);
+
+int ocfs2_claim_metadata(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
+ u64 *suballoc_loc,
u16 *suballoc_bit_start,
u32 *num_bits,
u64 *blkno_start);
-int ocfs2_claim_new_inode(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+int ocfs2_claim_new_inode(handle_t *handle,
+ struct inode *dir,
+ struct buffer_head *parent_fe_bh,
struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
u16 *suballoc_bit,
u64 *fe_blkno);
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
- struct ocfs2_journal_handle *handle,
+int ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 *cluster_start,
u32 *num_clusters);
+/*
+ * Use this variant of ocfs2_claim_clusters to specify a maxiumum
+ * number of clusters smaller than the allocation reserved.
+ */
+int __ocfs2_claim_clusters(handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 min_clusters,
+ u32 max_clusters,
+ u32 *cluster_start,
+ u32 *num_clusters);
-int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
+int ocfs2_free_suballoc_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *alloc_bh,
+ unsigned int start_bit,
+ u64 bg_blkno,
+ unsigned int count);
+int ocfs2_free_dinode(handle_t *handle,
struct inode *inode_alloc_inode,
struct buffer_head *inode_alloc_bh,
struct ocfs2_dinode *di);
-int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
- struct inode *eb_alloc_inode,
- struct buffer_head *eb_alloc_bh,
- struct ocfs2_extent_block *eb);
-int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
+int ocfs2_free_clusters(handle_t *handle,
struct inode *bitmap_inode,
struct buffer_head *bitmap_bh,
u64 start_blk,
unsigned int num_clusters);
+int ocfs2_release_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters);
+
+static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
+{
+ u64 group = block - (u64) bit;
+
+ return group;
+}
static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
u64 bg_blkno)
@@ -130,5 +186,52 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
* apis above. */
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac);
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
+
+/* given a cluster offset, calculate which block group it belongs to
+ * and return that block offset. */
+u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
+
+/*
+ * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
+ * finds a problem. A caller that wants to check a group descriptor
+ * without going readonly should read the block with ocfs2_read_block[s]()
+ * and then checking it with this function. This is only resize, really.
+ * Everyone else should be using ocfs2_read_group_descriptor().
+ */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+ struct ocfs2_dinode *di,
+ struct buffer_head *bh);
+/*
+ * Read a group descriptor block into *bh. If *bh is NULL, a bh will be
+ * allocated. This is a cached read. The descriptor will be validated with
+ * ocfs2_validate_group_descriptor().
+ */
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+ u64 gd_blkno, struct buffer_head **bh);
+
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
+ u32 clusters_to_add, u32 extents_to_split,
+ struct ocfs2_alloc_context **data_ac,
+ struct ocfs2_alloc_context **meta_ac);
+
+int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
+
+
+
+/*
+ * The following two interfaces are for ocfs2_create_inode_in_orphan().
+ */
+int ocfs2_find_new_inode_loc(struct inode *dir,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_alloc_context *ac,
+ u64 *fe_blkno);
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
+ u16 *suballoc_bit,
+ u64 di_blkno);
#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4c29cd7cc8e..ddb662b3244 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -28,7 +28,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/statfs.h>
@@ -39,10 +38,14 @@
#include <linux/parser.h>
#include <linux/crc32.h>
#include <linux/debugfs.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/quotaops.h>
+#include <linux/cleancache.h>
-#include <cluster/nodemanager.h>
+#define CREATE_TRACE_POINTS
+#include "ocfs2_trace.h"
-#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -51,6 +54,8 @@
#include "ocfs1_fs_compat.h"
#include "alloc.h"
+#include "aops.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "export.h"
#include "extent_map.h"
@@ -63,28 +68,47 @@
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
-#include "ver.h"
-#include "vote.h"
+#include "xattr.h"
+#include "quota.h"
+#include "refcounttree.h"
+#include "suballoc.h"
#include "buffer_head_io.h"
-static kmem_cache_t *ocfs2_inode_cachep = NULL;
+static struct kmem_cache *ocfs2_inode_cachep;
+struct kmem_cache *ocfs2_dquot_cachep;
+struct kmem_cache *ocfs2_qf_chunk_cachep;
-kmem_cache_t *ocfs2_lock_cache = NULL;
-
-/* OCFS2 needs to schedule several differnt types of work which
+/* OCFS2 needs to schedule several different types of work which
* require cluster locking, disk I/O, recovery waits, etc. Since these
* types of work tend to be heavy we avoid using the kernel events
* workqueue and schedule on our own. */
struct workqueue_struct *ocfs2_wq = NULL;
-static struct dentry *ocfs2_debugfs_root = NULL;
+static struct dentry *ocfs2_debugfs_root;
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster file system");
+
+struct mount_options
+{
+ unsigned long commit_interval;
+ unsigned long mount_opt;
+ unsigned int atime_quantum;
+ signed short slot;
+ int localalloc_opt;
+ unsigned int resv_level;
+ int dir_resv_level;
+ char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+};
static int ocfs2_parse_options(struct super_block *sb, char *options,
- unsigned long *mount_opt, int is_remount);
+ struct mount_options *mopt,
+ int is_remount);
+static int ocfs2_check_set_options(struct super_block *sb,
+ struct mount_options *options);
+static int ocfs2_show_options(struct seq_file *s, struct dentry *root);
static void ocfs2_put_super(struct super_block *sb);
static int ocfs2_mount_volume(struct super_block *sb);
static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -99,36 +123,38 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait);
static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
-static int ocfs2_release_system_inodes(struct ocfs2_super *osb);
-static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
+static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
static int ocfs2_check_volume(struct ocfs2_super *osb);
static int ocfs2_verify_volume(struct ocfs2_dinode *di,
struct buffer_head *bh,
- u32 sectsize);
+ u32 sectsize,
+ struct ocfs2_blockcheck_stats *stats);
static int ocfs2_initialize_super(struct super_block *sb,
struct buffer_head *bh,
- int sector_size);
+ int sector_size,
+ struct ocfs2_blockcheck_stats *stats);
static int ocfs2_get_sector(struct super_block *sb,
struct buffer_head **bh,
int block,
int sect_size);
-static void ocfs2_write_super(struct super_block *sb);
static struct inode *ocfs2_alloc_inode(struct super_block *sb);
static void ocfs2_destroy_inode(struct inode *inode);
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
+static int ocfs2_enable_quotas(struct ocfs2_super *osb);
+static void ocfs2_disable_quotas(struct ocfs2_super *osb);
-static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
-
-static struct super_operations ocfs2_sops = {
+static const struct super_operations ocfs2_sops = {
.statfs = ocfs2_statfs,
.alloc_inode = ocfs2_alloc_inode,
.destroy_inode = ocfs2_destroy_inode,
.drop_inode = ocfs2_drop_inode,
- .clear_inode = ocfs2_clear_inode,
- .delete_inode = ocfs2_delete_inode,
+ .evict_inode = ocfs2_evict_inode,
.sync_fs = ocfs2_sync_fs,
- .write_super = ocfs2_write_super,
.put_super = ocfs2_put_super,
.remount_fs = ocfs2_remount,
+ .show_options = ocfs2_show_options,
+ .quota_read = ocfs2_quota_read,
+ .quota_write = ocfs2_quota_write,
};
enum {
@@ -139,12 +165,30 @@ enum {
Opt_nointr,
Opt_hb_none,
Opt_hb_local,
+ Opt_hb_global,
Opt_data_ordered,
Opt_data_writeback,
+ Opt_atime_quantum,
+ Opt_slot,
+ Opt_commit,
+ Opt_localalloc,
+ Opt_localflocks,
+ Opt_stack,
+ Opt_user_xattr,
+ Opt_nouser_xattr,
+ Opt_inode64,
+ Opt_acl,
+ Opt_noacl,
+ Opt_usrquota,
+ Opt_grpquota,
+ Opt_coherency_buffered,
+ Opt_coherency_full,
+ Opt_resv_level,
+ Opt_dir_resv_level,
Opt_err,
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_barrier, "barrier=%u"},
{Opt_err_panic, "errors=panic"},
{Opt_err_ro, "errors=remount-ro"},
@@ -152,29 +196,215 @@ static match_table_t tokens = {
{Opt_nointr, "nointr"},
{Opt_hb_none, OCFS2_HB_NONE},
{Opt_hb_local, OCFS2_HB_LOCAL},
+ {Opt_hb_global, OCFS2_HB_GLOBAL},
{Opt_data_ordered, "data=ordered"},
{Opt_data_writeback, "data=writeback"},
+ {Opt_atime_quantum, "atime_quantum=%u"},
+ {Opt_slot, "preferred_slot=%u"},
+ {Opt_commit, "commit=%u"},
+ {Opt_localalloc, "localalloc=%d"},
+ {Opt_localflocks, "localflocks"},
+ {Opt_stack, "cluster_stack=%s"},
+ {Opt_user_xattr, "user_xattr"},
+ {Opt_nouser_xattr, "nouser_xattr"},
+ {Opt_inode64, "inode64"},
+ {Opt_acl, "acl"},
+ {Opt_noacl, "noacl"},
+ {Opt_usrquota, "usrquota"},
+ {Opt_grpquota, "grpquota"},
+ {Opt_coherency_buffered, "coherency=buffered"},
+ {Opt_coherency_full, "coherency=full"},
+ {Opt_resv_level, "resv_level=%u"},
+ {Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_err, NULL}
};
-/*
- * write_super and sync_fs ripped right out of ext3.
- */
-static void ocfs2_write_super(struct super_block *sb)
+#ifdef CONFIG_DEBUG_FS
+static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
+{
+ struct ocfs2_cluster_connection *cconn = osb->cconn;
+ struct ocfs2_recovery_map *rm = osb->recovery_map;
+ struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
+ int i, out = 0;
+
+ out += snprintf(buf + out, len - out,
+ "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
+ "Device", osb->dev_str, osb->uuid_str,
+ osb->fs_generation, osb->vol_label);
+
+ out += snprintf(buf + out, len - out,
+ "%10s => State: %d Flags: 0x%lX\n", "Volume",
+ atomic_read(&osb->vol_state), osb->osb_flags);
+
+ out += snprintf(buf + out, len - out,
+ "%10s => Block: %lu Cluster: %d\n", "Sizes",
+ osb->sb->s_blocksize, osb->s_clustersize);
+
+ out += snprintf(buf + out, len - out,
+ "%10s => Compat: 0x%X Incompat: 0x%X "
+ "ROcompat: 0x%X\n",
+ "Features", osb->s_feature_compat,
+ osb->s_feature_incompat, osb->s_feature_ro_compat);
+
+ out += snprintf(buf + out, len - out,
+ "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
+ osb->s_mount_opt, osb->s_atime_quantum);
+
+ if (cconn) {
+ out += snprintf(buf + out, len - out,
+ "%10s => Stack: %s Name: %*s "
+ "Version: %d.%d\n", "Cluster",
+ (*osb->osb_cluster_stack == '\0' ?
+ "o2cb" : osb->osb_cluster_stack),
+ cconn->cc_namelen, cconn->cc_name,
+ cconn->cc_version.pv_major,
+ cconn->cc_version.pv_minor);
+ }
+
+ spin_lock(&osb->dc_task_lock);
+ out += snprintf(buf + out, len - out,
+ "%10s => Pid: %d Count: %lu WakeSeq: %lu "
+ "WorkSeq: %lu\n", "DownCnvt",
+ (osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
+ osb->blocked_lock_count, osb->dc_wake_sequence,
+ osb->dc_work_sequence);
+ spin_unlock(&osb->dc_task_lock);
+
+ spin_lock(&osb->osb_lock);
+ out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
+ "Recovery",
+ (osb->recovery_thread_task ?
+ task_pid_nr(osb->recovery_thread_task) : -1));
+ if (rm->rm_used == 0)
+ out += snprintf(buf + out, len - out, " None\n");
+ else {
+ for (i = 0; i < rm->rm_used; i++)
+ out += snprintf(buf + out, len - out, " %d",
+ rm->rm_entries[i]);
+ out += snprintf(buf + out, len - out, "\n");
+ }
+ spin_unlock(&osb->osb_lock);
+
+ out += snprintf(buf + out, len - out,
+ "%10s => Pid: %d Interval: %lu\n", "Commit",
+ (osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
+ osb->osb_commit_interval);
+
+ out += snprintf(buf + out, len - out,
+ "%10s => State: %d TxnId: %lu NumTxns: %d\n",
+ "Journal", osb->journal->j_state,
+ osb->journal->j_trans_id,
+ atomic_read(&osb->journal->j_num_trans));
+
+ out += snprintf(buf + out, len - out,
+ "%10s => GlobalAllocs: %d LocalAllocs: %d "
+ "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n",
+ "Stats",
+ atomic_read(&osb->alloc_stats.bitmap_data),
+ atomic_read(&osb->alloc_stats.local_data),
+ atomic_read(&osb->alloc_stats.bg_allocs),
+ atomic_read(&osb->alloc_stats.moves),
+ atomic_read(&osb->alloc_stats.bg_extends));
+
+ out += snprintf(buf + out, len - out,
+ "%10s => State: %u Descriptor: %llu Size: %u bits "
+ "Default: %u bits\n",
+ "LocalAlloc", osb->local_alloc_state,
+ (unsigned long long)osb->la_last_gd,
+ osb->local_alloc_bits, osb->local_alloc_default_bits);
+
+ spin_lock(&osb->osb_lock);
+ out += snprintf(buf + out, len - out,
+ "%10s => InodeSlot: %d StolenInodes: %d, "
+ "MetaSlot: %d StolenMeta: %d\n", "Steal",
+ osb->s_inode_steal_slot,
+ atomic_read(&osb->s_num_inodes_stolen),
+ osb->s_meta_steal_slot,
+ atomic_read(&osb->s_num_meta_stolen));
+ spin_unlock(&osb->osb_lock);
+
+ out += snprintf(buf + out, len - out, "OrphanScan => ");
+ out += snprintf(buf + out, len - out, "Local: %u Global: %u ",
+ os->os_count, os->os_seqno);
+ out += snprintf(buf + out, len - out, " Last Scan: ");
+ if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
+ out += snprintf(buf + out, len - out, "Disabled\n");
+ else
+ out += snprintf(buf + out, len - out, "%lu seconds ago\n",
+ (get_seconds() - os->os_scantime.tv_sec));
+
+ out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
+ "Slots", "Num", "RecoGen");
+ for (i = 0; i < osb->max_slots; ++i) {
+ out += snprintf(buf + out, len - out,
+ "%10s %c %3d %10d\n",
+ " ",
+ (i == osb->slot_num ? '*' : ' '),
+ i, osb->slot_recovery_generations[i]);
+ }
+
+ return out;
+}
+
+static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
+{
+ struct ocfs2_super *osb = inode->i_private;
+ char *buf = NULL;
+
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ goto bail;
+
+ i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE));
+
+ file->private_data = buf;
+
+ return 0;
+bail:
+ return -ENOMEM;
+}
+
+static int ocfs2_debug_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+ i_size_read(file->f_mapping->host));
+}
+#else
+static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+static int ocfs2_debug_release(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
{
- if (mutex_trylock(&sb->s_lock) != 0)
- BUG();
- sb->s_dirt = 0;
+ return 0;
}
+#endif /* CONFIG_DEBUG_FS */
+
+static const struct file_operations ocfs2_osb_debug_fops = {
+ .open = ocfs2_osb_debug_open,
+ .release = ocfs2_debug_release,
+ .read = ocfs2_debug_read,
+ .llseek = generic_file_llseek,
+};
static int ocfs2_sync_fs(struct super_block *sb, int wait)
{
- int status = 0;
+ int status;
tid_t target;
struct ocfs2_super *osb = OCFS2_SB(sb);
- sb->s_dirt = 0;
-
if (ocfs2_is_hard_readonly(osb))
return -EROFS;
@@ -186,23 +416,35 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
ocfs2_schedule_truncate_log_flush(osb, 0);
}
- if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
+ if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
+ &target)) {
if (wait)
- log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
- target);
+ jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
+ target);
}
return 0;
}
+static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
+{
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+ && (ino == USER_QUOTA_SYSTEM_INODE
+ || ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
+ return 0;
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+ && (ino == GROUP_QUOTA_SYSTEM_INODE
+ || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
+ return 0;
+ return 1;
+}
+
static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
{
struct inode *new = NULL;
int status = 0;
int i;
- mlog_entry_void();
-
- new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE);
+ new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
if (IS_ERR(new)) {
status = PTR_ERR(new);
mlog_errno(status);
@@ -210,7 +452,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
}
osb->root_inode = new;
- new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE);
+ new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
if (IS_ERR(new)) {
status = PTR_ERR(new);
mlog_errno(status);
@@ -220,6 +462,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
+ if (!ocfs2_need_system_inode(osb, i))
+ continue;
new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
if (!new) {
ocfs2_release_system_inodes(osb);
@@ -235,7 +479,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
}
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -245,11 +490,11 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
int status = 0;
int i;
- mlog_entry_void();
-
for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
i < NUM_SYSTEM_INODES;
i++) {
+ if (!ocfs2_need_system_inode(osb, i))
+ continue;
new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
if (!new) {
ocfs2_release_system_inodes(osb);
@@ -263,22 +508,21 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
}
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
+static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
{
- int status = 0, i;
+ int i;
struct inode *inode;
- mlog_entry_void();
-
- for (i = 0; i < NUM_SYSTEM_INODES; i++) {
- inode = osb->system_inodes[i];
+ for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
+ inode = osb->global_system_inodes[i];
if (inode) {
iput(inode);
- osb->system_inodes[i] = NULL;
+ osb->global_system_inodes[i] = NULL;
}
}
@@ -294,8 +538,18 @@ static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
osb->root_inode = NULL;
}
- mlog_exit(status);
- return status;
+ if (!osb->local_system_inodes)
+ return;
+
+ for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
+ if (osb->local_system_inodes[i]) {
+ iput(osb->local_system_inodes[i]);
+ osb->local_system_inodes[i] = NULL;
+ }
+ }
+
+ kfree(osb->local_system_inodes);
+ osb->local_system_inodes = NULL;
}
/* We're allocating fs objects, use GFP_NOFS */
@@ -303,81 +557,123 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
{
struct ocfs2_inode_info *oi;
- oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS);
+ oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS);
if (!oi)
return NULL;
+ oi->i_sync_tid = 0;
+ oi->i_datasync_tid = 0;
+
+ jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
return &oi->vfs_inode;
}
-static void ocfs2_destroy_inode(struct inode *inode)
+static void ocfs2_i_callback(struct rcu_head *head)
{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
}
-/* From xfs_super.c:xfs_max_file_offset
- * Copyright (c) 2000-2004 Silicon Graphics, Inc.
- */
-static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
+static void ocfs2_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, ocfs2_i_callback);
+}
+
+static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
+ unsigned int cbits)
{
- unsigned int pagefactor = 1;
- unsigned int bitshift = BITS_PER_LONG - 1;
-
- /* Figure out maximum filesize, on Linux this can depend on
- * the filesystem blocksize (on 32 bit platforms).
- * __block_prepare_write does this in an [unsigned] long...
- * page->index << (PAGE_CACHE_SHIFT - bbits)
- * So, for page sized blocks (4K on 32 bit platforms),
- * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
- * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
- * but for smaller blocksizes it is less (bbits = log2 bsize).
- * Note1: get_block_t takes a long (implicit cast from above)
- * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
- * can optionally convert the [unsigned] long from above into
- * an [unsigned] long long.
+ unsigned int bytes = 1 << cbits;
+ unsigned int trim = bytes;
+ unsigned int bitshift = 32;
+
+ /*
+ * i_size and all block offsets in ocfs2 are always 64 bits
+ * wide. i_clusters is 32 bits, in cluster-sized units. So on
+ * 64 bit platforms, cluster size will be the limiting factor.
*/
#if BITS_PER_LONG == 32
-# if defined(CONFIG_LBD)
- BUG_ON(sizeof(sector_t) != 8);
- pagefactor = PAGE_CACHE_SIZE;
- bitshift = BITS_PER_LONG;
+# if defined(CONFIG_LBDAF)
+ BUILD_BUG_ON(sizeof(sector_t) != 8);
+ /*
+ * We might be limited by page cache size.
+ */
+ if (bytes > PAGE_CACHE_SIZE) {
+ bytes = PAGE_CACHE_SIZE;
+ trim = 1;
+ /*
+ * Shift by 31 here so that we don't get larger than
+ * MAX_LFS_FILESIZE
+ */
+ bitshift = 31;
+ }
# else
- pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
+ /*
+ * We are limited by the size of sector_t. Use block size, as
+ * that's what we expose to the VFS.
+ */
+ bytes = 1 << bbits;
+ trim = 1;
+ bitshift = 31;
# endif
#endif
- return (((unsigned long long)pagefactor) << bitshift) - 1;
+ /*
+ * Trim by a whole cluster when we can actually approach the
+ * on-disk limits. Otherwise we can overflow i_clusters when
+ * an extent start is at the max offset.
+ */
+ return (((unsigned long long)bytes) << bitshift) - trim;
}
static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
{
int incompat_features;
int ret = 0;
- unsigned long parsed_options;
+ struct mount_options parsed_options;
struct ocfs2_super *osb = OCFS2_SB(sb);
+ u32 tmp;
+
+ sync_filesystem(sb);
- if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
+ if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
+ !ocfs2_check_set_options(sb, &parsed_options)) {
ret = -EINVAL;
goto out;
}
- if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
- (parsed_options & OCFS2_MOUNT_HB_LOCAL)) {
+ tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
+ OCFS2_MOUNT_HB_NONE;
+ if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
ret = -EINVAL;
mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
goto out;
}
if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
- (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) {
+ (parsed_options.mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) {
ret = -EINVAL;
mlog(ML_ERROR, "Cannot change data mode on remount\n");
goto out;
}
+ /* Probably don't want this on remount; it might
+ * mess with other nodes */
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
+ (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
+ ret = -EINVAL;
+ mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
+ goto out;
+ }
+
/* We're going to/from readonly mode. */
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+ /* Disable quota accounting before remounting RO */
+ if (*flags & MS_RDONLY) {
+ ret = ocfs2_susp_quotas(osb, 0);
+ if (ret < 0)
+ goto out;
+ }
/* Lock here so the check of HARD_RO and the potential
* setting of SOFT_RO is atomic. */
spin_lock(&osb->osb_lock);
@@ -388,12 +684,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
}
if (*flags & MS_RDONLY) {
- mlog(0, "Going to ro mode.\n");
sb->s_flags |= MS_RDONLY;
osb->osb_flags |= OCFS2_OSB_SOFT_RO;
} else {
- mlog(0, "Making ro filesystem writeable.\n");
-
if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
mlog(ML_ERROR, "Cannot remount RDWR "
"filesystem due to previous errors.\n");
@@ -411,17 +704,41 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
sb->s_flags &= ~MS_RDONLY;
osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
}
+ trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
unlock_osb:
spin_unlock(&osb->osb_lock);
+ /* Enable quota accounting after remounting RW */
+ if (!ret && !(*flags & MS_RDONLY)) {
+ if (sb_any_quota_suspended(sb))
+ ret = ocfs2_susp_quotas(osb, 1);
+ else
+ ret = ocfs2_enable_quotas(osb);
+ if (ret < 0) {
+ /* Return back changes... */
+ spin_lock(&osb->osb_lock);
+ sb->s_flags |= MS_RDONLY;
+ osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+ spin_unlock(&osb->osb_lock);
+ goto out;
+ }
+ }
}
if (!ret) {
+ /* Only save off the new mount options in case of a successful
+ * remount. */
+ osb->s_mount_opt = parsed_options.mount_opt;
+ osb->s_atime_quantum = parsed_options.atime_quantum;
+ osb->preferred_slot = parsed_options.slot;
+ if (parsed_options.commit_interval)
+ osb->osb_commit_interval = parsed_options.commit_interval;
+
if (!ocfs2_is_hard_readonly(osb))
ocfs2_set_journal_params(osb);
- /* Only save off the new mount options in case of a successful
- * remount. */
- osb->s_mount_opt = parsed_options;
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
+ MS_POSIXACL : 0);
}
out:
return ret;
@@ -429,9 +746,10 @@ out:
static int ocfs2_sb_probe(struct super_block *sb,
struct buffer_head **bh,
- int *sector_size)
+ int *sector_size,
+ struct ocfs2_blockcheck_stats *stats)
{
- int status = 0, tmpstat;
+ int status, tmpstat;
struct ocfs1_vol_disk_hdr *hdr;
struct ocfs2_dinode *di;
int blksize;
@@ -439,7 +757,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
*bh = NULL;
/* may be > 512 */
- *sector_size = bdev_hardsect_size(sb->s_bdev);
+ *sector_size = bdev_logical_block_size(sb->s_bdev);
if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
*sector_size, OCFS2_MAX_BLOCKSIZE);
@@ -492,48 +810,241 @@ static int ocfs2_sb_probe(struct super_block *sb,
if (tmpstat < 0) {
status = tmpstat;
mlog_errno(status);
- goto bail;
+ break;
}
di = (struct ocfs2_dinode *) (*bh)->b_data;
- status = ocfs2_verify_volume(di, *bh, blksize);
- if (status >= 0)
- goto bail;
- brelse(*bh);
- *bh = NULL;
- if (status != -EAGAIN)
+ memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
+ spin_lock_init(&stats->b_lock);
+ tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats);
+ if (tmpstat < 0) {
+ brelse(*bh);
+ *bh = NULL;
+ }
+ if (tmpstat != -EAGAIN) {
+ status = tmpstat;
break;
+ }
}
bail:
return status;
}
+static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
+{
+ u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
+
+ if (osb->s_mount_opt & hb_enabled) {
+ if (ocfs2_mount_local(osb)) {
+ mlog(ML_ERROR, "Cannot heartbeat on a locally "
+ "mounted device.\n");
+ return -EINVAL;
+ }
+ if (ocfs2_userspace_stack(osb)) {
+ mlog(ML_ERROR, "Userspace stack expected, but "
+ "o2cb heartbeat arguments passed to mount\n");
+ return -EINVAL;
+ }
+ if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
+ !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
+ ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
+ ocfs2_cluster_o2cb_global_heartbeat(osb))) {
+ mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!(osb->s_mount_opt & hb_enabled)) {
+ if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
+ !ocfs2_userspace_stack(osb)) {
+ mlog(ML_ERROR, "Heartbeat has to be started to mount "
+ "a read-write clustered device.\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * If we're using a userspace stack, mount should have passed
+ * a name that matches the disk. If not, mount should not
+ * have passed a stack.
+ */
+static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
+ struct mount_options *mopt)
+{
+ if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) {
+ mlog(ML_ERROR,
+ "cluster stack passed to mount, but this filesystem "
+ "does not support it\n");
+ return -EINVAL;
+ }
+
+ if (ocfs2_userspace_stack(osb) &&
+ strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
+ OCFS2_STACK_LABEL_LEN)) {
+ mlog(ML_ERROR,
+ "cluster stack passed to mount (\"%s\") does not "
+ "match the filesystem (\"%s\")\n",
+ mopt->cluster_stack,
+ osb->osb_cluster_stack);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
+{
+ int type;
+ struct super_block *sb = osb->sb;
+ unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ int status = 0;
+
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+ continue;
+ if (unsuspend)
+ status = dquot_resume(sb, type);
+ else {
+ struct ocfs2_mem_dqinfo *oinfo;
+
+ /* Cancel periodic syncing before suspending */
+ oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+ status = dquot_suspend(sb, type);
+ }
+ if (status < 0)
+ break;
+ }
+ if (status < 0)
+ mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
+ "remount (error = %d).\n", status);
+ return status;
+}
+
+static int ocfs2_enable_quotas(struct ocfs2_super *osb)
+{
+ struct inode *inode[MAXQUOTAS] = { NULL, NULL };
+ struct super_block *sb = osb->sb;
+ unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+ int status;
+ int type;
+
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+ continue;
+ inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
+ osb->slot_num);
+ if (!inode[type]) {
+ status = -ENOENT;
+ goto out_quota_off;
+ }
+ status = dquot_enable(inode[type], type, QFMT_OCFS2,
+ DQUOT_USAGE_ENABLED);
+ if (status < 0)
+ goto out_quota_off;
+ }
+
+ for (type = 0; type < MAXQUOTAS; type++)
+ iput(inode[type]);
+ return 0;
+out_quota_off:
+ ocfs2_disable_quotas(osb);
+ for (type = 0; type < MAXQUOTAS; type++)
+ iput(inode[type]);
+ mlog_errno(status);
+ return status;
+}
+
+static void ocfs2_disable_quotas(struct ocfs2_super *osb)
+{
+ int type;
+ struct inode *inode;
+ struct super_block *sb = osb->sb;
+ struct ocfs2_mem_dqinfo *oinfo;
+
+ /* We mostly ignore errors in this function because there's not much
+ * we can do when we see them */
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!sb_has_quota_loaded(sb, type))
+ continue;
+ /* Cancel periodic syncing before we grab dqonoff_mutex */
+ oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+ inode = igrab(sb->s_dquot.files[type]);
+ /* Turn off quotas. This will remove all dquot structures from
+ * memory and so they will be automatically synced to global
+ * quota files */
+ dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED);
+ if (!inode)
+ continue;
+ iput(inode);
+ }
+}
+
+/* Handle quota on quotactl */
+static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
+{
+ unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+ return -EINVAL;
+
+ return dquot_enable(sb_dqopt(sb)->files[type], type,
+ format_id, DQUOT_LIMITS_ENABLED);
+}
+
+/* Handle quota off quotactl */
+static int ocfs2_quota_off(struct super_block *sb, int type)
+{
+ return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
+}
+
+static const struct quotactl_ops ocfs2_quotactl_ops = {
+ .quota_on_meta = ocfs2_quota_on,
+ .quota_off = ocfs2_quota_off,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk,
+};
+
static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
{
struct dentry *root;
int status, sector_size;
- unsigned long parsed_opt;
+ struct mount_options parsed_options;
struct inode *inode = NULL;
struct ocfs2_super *osb = NULL;
struct buffer_head *bh = NULL;
+ char nodestr[12];
+ struct ocfs2_blockcheck_stats stats;
- mlog_entry("%p, %p, %i", sb, data, silent);
+ trace_ocfs2_fill_super(sb, data, silent);
- /* for now we only have one cluster/node, make sure we see it
- * in the heartbeat universe */
- if (!o2hb_check_local_node_heartbeating()) {
+ if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
status = -EINVAL;
goto read_super_error;
}
/* probe for superblock */
- status = ocfs2_sb_probe(sb, &bh, &sector_size);
+ status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats);
if (status < 0) {
mlog(ML_ERROR, "superblock probe failed!\n");
goto read_super_error;
}
- status = ocfs2_initialize_super(sb, bh, sector_size);
+ status = ocfs2_initialize_super(sb, bh, sector_size, &stats);
osb = OCFS2_SB(sb);
if (status < 0) {
mlog_errno(status);
@@ -542,14 +1053,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
bh = NULL;
- if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) {
+ if (!ocfs2_check_set_options(sb, &parsed_options)) {
status = -EINVAL;
goto read_super_error;
}
- osb->s_mount_opt = parsed_opt;
+ osb->s_mount_opt = parsed_options.mount_opt;
+ osb->s_atime_quantum = parsed_options.atime_quantum;
+ osb->preferred_slot = parsed_options.slot;
+ osb->osb_commit_interval = parsed_options.commit_interval;
+
+ ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
+ osb->osb_resv_level = parsed_options.resv_level;
+ osb->osb_dir_resv_level = parsed_options.resv_level;
+ if (parsed_options.dir_resv_level == -1)
+ osb->osb_dir_resv_level = parsed_options.resv_level;
+ else
+ osb->osb_dir_resv_level = parsed_options.dir_resv_level;
+
+ status = ocfs2_verify_userspace_stack(osb, &parsed_options);
+ if (status)
+ goto read_super_error;
sb->s_magic = OCFS2_SUPER_MAGIC;
+ sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
+ ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
* heartbeat=none */
if (bdev_read_only(sb->s_bdev)) {
@@ -576,33 +1105,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
"file system, but write access is "
"unavailable.\n");
else
- mlog_errno(status);
+ mlog_errno(status);
goto read_super_error;
}
ocfs2_set_ro_flag(osb, 1);
- printk(KERN_NOTICE "Readonly device detected. No cluster "
- "services will be utilized for this mount. Recovery "
- "will be skipped.\n");
+ printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
+ "Cluster services will not be used for this mount. "
+ "Recovery will be skipped.\n", osb->dev_str);
}
if (!ocfs2_is_hard_readonly(osb)) {
- /* If this isn't a hard readonly mount, then we need
- * to make sure that heartbeat is in a valid state,
- * and that we mark ourselves soft readonly is -oro
- * was specified. */
- if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
- mlog(ML_ERROR, "No heartbeat for device (%s)\n",
- sb->s_id);
- status = -EINVAL;
- goto read_super_error;
- }
-
if (sb->s_flags & MS_RDONLY)
ocfs2_set_ro_flag(osb, 0);
}
+ status = ocfs2_verify_heartbeat(osb);
+ if (status < 0) {
+ mlog_errno(status);
+ goto read_super_error;
+ }
+
osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
ocfs2_debugfs_root);
if (!osb->osb_debug_root) {
@@ -611,20 +1135,42 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
goto read_super_error;
}
- status = ocfs2_mount_volume(sb);
- if (osb->root_inode)
- inode = igrab(osb->root_inode);
+ osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
+ osb->osb_debug_root,
+ osb,
+ &ocfs2_osb_debug_fops);
+ if (!osb->osb_ctxt) {
+ status = -EINVAL;
+ mlog_errno(status);
+ goto read_super_error;
+ }
+ if (ocfs2_meta_ecc(osb)) {
+ status = ocfs2_blockcheck_stats_debugfs_install(
+ &osb->osb_ecc_stats,
+ osb->osb_debug_root);
+ if (status) {
+ mlog(ML_ERROR,
+ "Unable to create blockcheck statistics "
+ "files\n");
+ goto read_super_error;
+ }
+ }
+
+ status = ocfs2_mount_volume(sb);
if (status < 0)
goto read_super_error;
+ if (osb->root_inode)
+ inode = igrab(osb->root_inode);
+
if (!inode) {
status = -EIO;
mlog_errno(status);
goto read_super_error;
}
- root = d_alloc_root(inode);
+ root = d_make_root(inode);
if (!root) {
status = -ENOMEM;
mlog_errno(status);
@@ -635,24 +1181,48 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
ocfs2_complete_mount_recovery(osb);
- printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %d, slot %d) "
+ if (ocfs2_mount_local(osb))
+ snprintf(nodestr, sizeof(nodestr), "local");
+ else
+ snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
+
+ printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) "
"with %s data mode.\n",
- osb->dev_str, osb->node_num, osb->slot_num,
+ osb->dev_str, nodestr, osb->slot_num,
osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
"ordered");
atomic_set(&osb->vol_state, VOLUME_MOUNTED);
wake_up(&osb->osb_mount_event);
- mlog_exit(status);
+ /* Now we can initialize quotas because we can afford to wait
+ * for cluster locks recovery now. That also means that truncation
+ * log recovery can happen but that waits for proper quota setup */
+ if (!(sb->s_flags & MS_RDONLY)) {
+ status = ocfs2_enable_quotas(osb);
+ if (status < 0) {
+ /* We have to err-out specially here because
+ * s_root is already set */
+ mlog_errno(status);
+ atomic_set(&osb->vol_state, VOLUME_DISABLED);
+ wake_up(&osb->osb_mount_event);
+ return status;
+ }
+ }
+
+ ocfs2_complete_quota_recovery(osb);
+
+ /* Now we wake up again for processes waiting for quotas */
+ atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
+ wake_up(&osb->osb_mount_event);
+
+ /* Start this when the mount is almost sure of being successful */
+ ocfs2_orphan_scan_start(osb);
+
return status;
read_super_error:
- if (bh != NULL)
- brelse(bh);
-
- if (inode)
- iput(inode);
+ brelse(bh);
if (osb) {
atomic_set(&osb->vol_state, VOLUME_DISABLED);
@@ -660,44 +1230,82 @@ read_super_error:
ocfs2_dismount_volume(sb, 1);
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-static int ocfs2_get_sb(struct file_system_type *fs_type,
+static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
int flags,
const char *dev_name,
- void *data,
- struct vfsmount *mnt)
+ void *data)
{
- return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super,
- mnt);
+ return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
}
static struct file_system_type ocfs2_fs_type = {
.owner = THIS_MODULE,
.name = "ocfs2",
- .get_sb = ocfs2_get_sb, /* is this called when we mount
- * the fs? */
- .kill_sb = kill_block_super, /* set to the generic one
- * right now, but do we
- * need to change that? */
+ .mount = ocfs2_mount,
+ .kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
.next = NULL
};
+MODULE_ALIAS_FS("ocfs2");
+
+static int ocfs2_check_set_options(struct super_block *sb,
+ struct mount_options *options)
+{
+ if (options->mount_opt & OCFS2_MOUNT_USRQUOTA &&
+ !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+ mlog(ML_ERROR, "User quotas were requested, but this "
+ "filesystem does not have the feature enabled.\n");
+ return 0;
+ }
+ if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA &&
+ !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+ mlog(ML_ERROR, "Group quotas were requested, but this "
+ "filesystem does not have the feature enabled.\n");
+ return 0;
+ }
+ if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL &&
+ !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) {
+ mlog(ML_ERROR, "ACL support requested but extended attributes "
+ "feature is not enabled\n");
+ return 0;
+ }
+ /* No ACL setting specified? Use XATTR feature... */
+ if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL |
+ OCFS2_MOUNT_NO_POSIX_ACL))) {
+ if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR))
+ options->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+ else
+ options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
+ }
+ return 1;
+}
static int ocfs2_parse_options(struct super_block *sb,
char *options,
- unsigned long *mount_opt,
+ struct mount_options *mopt,
int is_remount)
{
- int status;
+ int status, user_stack = 0;
char *p;
+ u32 tmp;
- mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
- options ? options : "(none)");
+ trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
- *mount_opt = 0;
+ mopt->commit_interval = 0;
+ mopt->mount_opt = OCFS2_MOUNT_NOINTR;
+ mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
+ mopt->slot = OCFS2_INVALID_SLOT;
+ mopt->localalloc_opt = -1;
+ mopt->cluster_stack[0] = '\0';
+ mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
+ mopt->dir_resv_level = -1;
if (!options) {
status = 1;
@@ -714,10 +1322,13 @@ static int ocfs2_parse_options(struct super_block *sb,
token = match_token(p, tokens, args);
switch (token) {
case Opt_hb_local:
- *mount_opt |= OCFS2_MOUNT_HB_LOCAL;
+ mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
break;
case Opt_hb_none:
- *mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
+ mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
+ break;
+ case Opt_hb_global:
+ mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
break;
case Opt_barrier:
if (match_int(&args[0], &option)) {
@@ -725,27 +1336,158 @@ static int ocfs2_parse_options(struct super_block *sb,
goto bail;
}
if (option)
- *mount_opt |= OCFS2_MOUNT_BARRIER;
+ mopt->mount_opt |= OCFS2_MOUNT_BARRIER;
else
- *mount_opt &= ~OCFS2_MOUNT_BARRIER;
+ mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER;
break;
case Opt_intr:
- *mount_opt &= ~OCFS2_MOUNT_NOINTR;
+ mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR;
break;
case Opt_nointr:
- *mount_opt |= OCFS2_MOUNT_NOINTR;
+ mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
break;
case Opt_err_panic:
- *mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
break;
case Opt_err_ro:
- *mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
break;
case Opt_data_ordered:
- *mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
+ mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
break;
case Opt_data_writeback:
- *mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
+ mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
+ break;
+ case Opt_user_xattr:
+ mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
+ break;
+ case Opt_nouser_xattr:
+ mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
+ break;
+ case Opt_atime_quantum:
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option >= 0)
+ mopt->atime_quantum = option;
+ break;
+ case Opt_slot:
+ option = 0;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option)
+ mopt->slot = (s16)option;
+ break;
+ case Opt_commit:
+ option = 0;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option < 0)
+ return 0;
+ if (option == 0)
+ option = JBD2_DEFAULT_MAX_COMMIT_AGE;
+ mopt->commit_interval = HZ * option;
+ break;
+ case Opt_localalloc:
+ option = 0;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option >= 0)
+ mopt->localalloc_opt = option;
+ break;
+ case Opt_localflocks:
+ /*
+ * Changing this during remount could race
+ * flock() requests, or "unbalance" existing
+ * ones (e.g., a lock is taken in one mode but
+ * dropped in the other). If users care enough
+ * to flip locking modes during remount, we
+ * could add a "local" flag to individual
+ * flock structures for proper tracking of
+ * state.
+ */
+ if (!is_remount)
+ mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
+ break;
+ case Opt_stack:
+ /* Check both that the option we were passed
+ * is of the right length and that it is a proper
+ * string of the right length.
+ */
+ if (((args[0].to - args[0].from) !=
+ OCFS2_STACK_LABEL_LEN) ||
+ (strnlen(args[0].from,
+ OCFS2_STACK_LABEL_LEN) !=
+ OCFS2_STACK_LABEL_LEN)) {
+ mlog(ML_ERROR,
+ "Invalid cluster_stack option\n");
+ status = 0;
+ goto bail;
+ }
+ memcpy(mopt->cluster_stack, args[0].from,
+ OCFS2_STACK_LABEL_LEN);
+ mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+ /*
+ * Open code the memcmp here as we don't have
+ * an osb to pass to
+ * ocfs2_userspace_stack().
+ */
+ if (memcmp(mopt->cluster_stack,
+ OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ user_stack = 1;
+ break;
+ case Opt_inode64:
+ mopt->mount_opt |= OCFS2_MOUNT_INODE64;
+ break;
+ case Opt_usrquota:
+ mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
+ break;
+ case Opt_grpquota:
+ mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
+ break;
+ case Opt_coherency_buffered:
+ mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
+ break;
+ case Opt_coherency_full:
+ mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
+ break;
+ case Opt_acl:
+ mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+ mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
+ break;
+ case Opt_noacl:
+ mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
+ mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+ break;
+ case Opt_resv_level:
+ if (is_remount)
+ break;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option >= OCFS2_MIN_RESV_LEVEL &&
+ option < OCFS2_MAX_RESV_LEVEL)
+ mopt->resv_level = option;
+ break;
+ case Opt_dir_resv_level:
+ if (is_remount)
+ break;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option >= OCFS2_MIN_RESV_LEVEL &&
+ option < OCFS2_MAX_RESV_LEVEL)
+ mopt->dir_resv_level = option;
break;
default:
mlog(ML_ERROR,
@@ -756,40 +1498,122 @@ static int ocfs2_parse_options(struct super_block *sb,
}
}
+ if (user_stack == 0) {
+ /* Ensure only one heartbeat mode */
+ tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
+ OCFS2_MOUNT_HB_GLOBAL |
+ OCFS2_MOUNT_HB_NONE);
+ if (hweight32(tmp) != 1) {
+ mlog(ML_ERROR, "Invalid heartbeat mount options\n");
+ status = 0;
+ goto bail;
+ }
+ }
+
status = 1;
bail:
- mlog_exit(status);
return status;
}
-static int __init ocfs2_init(void)
+static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
{
- int status;
+ struct ocfs2_super *osb = OCFS2_SB(root->d_sb);
+ unsigned long opts = osb->s_mount_opt;
+ unsigned int local_alloc_megs;
+
+ if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
+ seq_printf(s, ",_netdev");
+ if (opts & OCFS2_MOUNT_HB_LOCAL)
+ seq_printf(s, ",%s", OCFS2_HB_LOCAL);
+ else
+ seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
+ } else
+ seq_printf(s, ",%s", OCFS2_HB_NONE);
- mlog_entry_void();
+ if (opts & OCFS2_MOUNT_NOINTR)
+ seq_printf(s, ",nointr");
- ocfs2_print_version();
+ if (opts & OCFS2_MOUNT_DATA_WRITEBACK)
+ seq_printf(s, ",data=writeback");
+ else
+ seq_printf(s, ",data=ordered");
- if (init_ocfs2_extent_maps())
- return -ENOMEM;
+ if (opts & OCFS2_MOUNT_BARRIER)
+ seq_printf(s, ",barrier=1");
+
+ if (opts & OCFS2_MOUNT_ERRORS_PANIC)
+ seq_printf(s, ",errors=panic");
+ else
+ seq_printf(s, ",errors=remount-ro");
+
+ if (osb->preferred_slot != OCFS2_INVALID_SLOT)
+ seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
+
+ seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
+
+ if (osb->osb_commit_interval)
+ seq_printf(s, ",commit=%u",
+ (unsigned) (osb->osb_commit_interval / HZ));
+
+ local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
+ if (local_alloc_megs != ocfs2_la_default_mb(osb))
+ seq_printf(s, ",localalloc=%d", local_alloc_megs);
+
+ if (opts & OCFS2_MOUNT_LOCALFLOCKS)
+ seq_printf(s, ",localflocks,");
+
+ if (osb->osb_cluster_stack[0])
+ seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
+ osb->osb_cluster_stack);
+ if (opts & OCFS2_MOUNT_USRQUOTA)
+ seq_printf(s, ",usrquota");
+ if (opts & OCFS2_MOUNT_GRPQUOTA)
+ seq_printf(s, ",grpquota");
+
+ if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
+ seq_printf(s, ",coherency=buffered");
+ else
+ seq_printf(s, ",coherency=full");
+
+ if (opts & OCFS2_MOUNT_NOUSERXATTR)
+ seq_printf(s, ",nouser_xattr");
+ else
+ seq_printf(s, ",user_xattr");
+
+ if (opts & OCFS2_MOUNT_INODE64)
+ seq_printf(s, ",inode64");
+
+ if (opts & OCFS2_MOUNT_POSIX_ACL)
+ seq_printf(s, ",acl");
+ else
+ seq_printf(s, ",noacl");
+
+ if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
+ seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
+
+ if (osb->osb_dir_resv_level != osb->osb_resv_level)
+ seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
+
+ return 0;
+}
+
+static int __init ocfs2_init(void)
+{
+ int status;
status = init_ocfs2_uptodate_cache();
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ if (status < 0)
+ goto out1;
status = ocfs2_initialize_mem_caches();
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ if (status < 0)
+ goto out2;
ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
if (!ocfs2_wq) {
status = -ENOMEM;
- goto leave;
+ goto out3;
}
ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
@@ -798,51 +1622,52 @@ static int __init ocfs2_init(void)
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
}
-leave:
- if (status < 0) {
- ocfs2_free_mem_caches();
- exit_ocfs2_uptodate_cache();
- exit_ocfs2_extent_maps();
- }
+ ocfs2_set_locking_protocol();
- mlog_exit(status);
-
- if (status >= 0) {
- return register_filesystem(&ocfs2_fs_type);
- } else
- return -1;
+ status = register_quota_format(&ocfs2_quota_format);
+ if (status < 0)
+ goto out4;
+ status = register_filesystem(&ocfs2_fs_type);
+ if (!status)
+ return 0;
+
+ unregister_quota_format(&ocfs2_quota_format);
+out4:
+ destroy_workqueue(ocfs2_wq);
+ debugfs_remove(ocfs2_debugfs_root);
+out3:
+ ocfs2_free_mem_caches();
+out2:
+ exit_ocfs2_uptodate_cache();
+out1:
+ mlog_errno(status);
+ return status;
}
static void __exit ocfs2_exit(void)
{
- mlog_entry_void();
-
if (ocfs2_wq) {
flush_workqueue(ocfs2_wq);
destroy_workqueue(ocfs2_wq);
}
+ unregister_quota_format(&ocfs2_quota_format);
+
debugfs_remove(ocfs2_debugfs_root);
ocfs2_free_mem_caches();
unregister_filesystem(&ocfs2_fs_type);
- exit_ocfs2_extent_maps();
-
exit_ocfs2_uptodate_cache();
-
- mlog_exit_void();
}
static void ocfs2_put_super(struct super_block *sb)
{
- mlog_entry("(0x%p)\n", sb);
+ trace_ocfs2_put_super(sb);
ocfs2_sync_blockdev(sb);
ocfs2_dismount_volume(sb, 0);
-
- mlog_exit_void();
}
static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -854,7 +1679,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
struct buffer_head *bh = NULL;
struct inode *inode = NULL;
- mlog_entry("(%p, %p)\n", dentry->d_sb, buf);
+ trace_ocfs2_statfs(dentry->d_sb, buf);
osb = OCFS2_SB(dentry->d_sb);
@@ -867,7 +1692,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
goto bail;
}
- status = ocfs2_meta_lock(inode, NULL, &bh, 0);
+ status = ocfs2_inode_lock(inode, &bh, 0);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -888,53 +1713,53 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = buf->f_bfree;
buf->f_files = numbits;
buf->f_ffree = freebits;
+ buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN)
+ & 0xFFFFFFFFUL;
+ buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN,
+ OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL;
brelse(bh);
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
status = 0;
bail:
if (inode)
iput(inode);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-static void ocfs2_inode_init_once(void *data,
- kmem_cache_t *cachep,
- unsigned long flags)
+static void ocfs2_inode_init_once(void *data)
{
struct ocfs2_inode_info *oi = data;
- if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
- SLAB_CTOR_CONSTRUCTOR) {
- oi->ip_flags = 0;
- oi->ip_open_count = 0;
- spin_lock_init(&oi->ip_lock);
- ocfs2_extent_map_init(&oi->vfs_inode);
- INIT_LIST_HEAD(&oi->ip_handle_list);
- INIT_LIST_HEAD(&oi->ip_io_markers);
- oi->ip_handle = NULL;
- oi->ip_created_trans = 0;
- oi->ip_last_trans = 0;
- oi->ip_dir_start_lookup = 0;
+ oi->ip_flags = 0;
+ oi->ip_open_count = 0;
+ spin_lock_init(&oi->ip_lock);
+ ocfs2_extent_map_init(&oi->vfs_inode);
+ INIT_LIST_HEAD(&oi->ip_io_markers);
+ oi->ip_dir_start_lookup = 0;
+ mutex_init(&oi->ip_unaligned_aio);
+ init_rwsem(&oi->ip_alloc_sem);
+ init_rwsem(&oi->ip_xattr_sem);
+ mutex_init(&oi->ip_io_mutex);
- init_rwsem(&oi->ip_alloc_sem);
- mutex_init(&oi->ip_io_mutex);
+ oi->ip_blkno = 0ULL;
+ oi->ip_clusters = 0;
- oi->ip_blkno = 0ULL;
- oi->ip_clusters = 0;
+ ocfs2_resv_init_once(&oi->ip_la_data_resv);
- ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
- ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
- ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+ ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
+ ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
+ ocfs2_lock_res_init_once(&oi->ip_open_lockres);
- ocfs2_metadata_cache_init(&oi->vfs_inode);
+ ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
+ &ocfs2_inode_caching_ops);
- inode_init_once(&oi->vfs_inode);
- }
+ inode_init_once(&oi->vfs_inode);
}
static int ocfs2_initialize_mem_caches(void)
@@ -944,30 +1769,50 @@ static int ocfs2_initialize_mem_caches(void)
0,
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
- ocfs2_inode_init_once, NULL);
- if (!ocfs2_inode_cachep)
- return -ENOMEM;
-
- ocfs2_lock_cache = kmem_cache_create("ocfs2_lock",
- sizeof(struct ocfs2_journal_lock),
- 0,
- SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (!ocfs2_lock_cache)
+ ocfs2_inode_init_once);
+ ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
+ sizeof(struct ocfs2_dquot),
+ 0,
+ (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD),
+ NULL);
+ ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
+ sizeof(struct ocfs2_quota_chunk),
+ 0,
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ NULL);
+ if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
+ !ocfs2_qf_chunk_cachep) {
+ if (ocfs2_inode_cachep)
+ kmem_cache_destroy(ocfs2_inode_cachep);
+ if (ocfs2_dquot_cachep)
+ kmem_cache_destroy(ocfs2_dquot_cachep);
+ if (ocfs2_qf_chunk_cachep)
+ kmem_cache_destroy(ocfs2_qf_chunk_cachep);
return -ENOMEM;
+ }
return 0;
}
static void ocfs2_free_mem_caches(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
if (ocfs2_inode_cachep)
kmem_cache_destroy(ocfs2_inode_cachep);
- if (ocfs2_lock_cache)
- kmem_cache_destroy(ocfs2_lock_cache);
-
ocfs2_inode_cachep = NULL;
- ocfs2_lock_cache = NULL;
+
+ if (ocfs2_dquot_cachep)
+ kmem_cache_destroy(ocfs2_dquot_cachep);
+ ocfs2_dquot_cachep = NULL;
+
+ if (ocfs2_qf_chunk_cachep)
+ kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+ ocfs2_qf_chunk_cachep = NULL;
}
static int ocfs2_get_sector(struct super_block *sb,
@@ -982,8 +1827,8 @@ static int ocfs2_get_sector(struct super_block *sb,
*bh = sb_getblk(sb, block);
if (!*bh) {
- mlog_errno(-EIO);
- return -EIO;
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
lock_buffer(*bh);
if (!buffer_dirty(*bh))
@@ -991,28 +1836,14 @@ static int ocfs2_get_sector(struct super_block *sb,
unlock_buffer(*bh);
ll_rw_block(READ, 1, bh);
wait_on_buffer(*bh);
- return 0;
-}
-
-/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
-static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
-{
- int status;
-
- /* XXX hold a ref on the node while mounte? easy enough, if
- * desirable. */
- osb->node_num = o2nm_this_node();
- if (osb->node_num == O2NM_MAX_NODES) {
- mlog(ML_ERROR, "could not find this host's node number\n");
- status = -ENOENT;
- goto bail;
+ if (!buffer_uptodate(*bh)) {
+ mlog_errno(-EIO);
+ brelse(*bh);
+ *bh = NULL;
+ return -EIO;
}
- mlog(0, "I am node %d\n", osb->node_num);
-
- status = 0;
-bail:
- return status;
+ return 0;
}
static int ocfs2_mount_volume(struct super_block *sb)
@@ -1021,36 +1852,15 @@ static int ocfs2_mount_volume(struct super_block *sb)
int unlock_super = 0;
struct ocfs2_super *osb = OCFS2_SB(sb);
- mlog_entry_void();
-
if (ocfs2_is_hard_readonly(osb))
goto leave;
- status = ocfs2_fill_local_node_info(osb);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- status = ocfs2_register_hb_callbacks(osb);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
status = ocfs2_dlm_init(osb);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- /* requires vote_thread to be running. */
- status = ocfs2_register_net_handlers(osb);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
mlog_errno(status);
@@ -1065,8 +1875,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
goto leave;
}
- ocfs2_populate_mounted_map(osb);
-
/* load all node-local system inodes */
status = ocfs2_init_local_system_inodes(osb);
if (status < 0) {
@@ -1081,17 +1889,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
}
status = ocfs2_truncate_log_init(osb);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- /* This should be sent *after* we recovered our journal as it
- * will cause other nodes to unmark us as needing
- * recovery. However, we need to send it *before* dropping the
- * super block lock as otherwise their recovery threads might
- * try to clean us up while we're live! */
- status = ocfs2_request_mount_vote(osb);
if (status < 0)
mlog_errno(status);
@@ -1099,88 +1896,92 @@ leave:
if (unlock_super)
ocfs2_super_unlock(osb, 1);
- mlog_exit(status);
return status;
}
-/* we can't grab the goofy sem lock from inside wait_event, so we use
- * memory barriers to make sure that we'll see the null task before
- * being woken up */
-static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
-{
- mb();
- return osb->recovery_thread_task != NULL;
-}
-
static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
{
- int tmp;
+ int tmp, hangup_needed = 0;
struct ocfs2_super *osb = NULL;
+ char nodestr[12];
- mlog_entry("(0x%p)\n", sb);
+ trace_ocfs2_dismount_volume(sb);
BUG_ON(!sb);
osb = OCFS2_SB(sb);
BUG_ON(!osb);
+ debugfs_remove(osb->osb_ctxt);
+
+ /* Orphan scan should be stopped as early as possible */
+ ocfs2_orphan_scan_stop(osb);
+
+ ocfs2_disable_quotas(osb);
+
+ /* All dquots should be freed by now */
+ WARN_ON(!llist_empty(&osb->dquot_drop_list));
+ /* Wait for worker to be done with the work structure in osb */
+ cancel_work_sync(&osb->dquot_drop_work);
+
ocfs2_shutdown_local_alloc(osb);
ocfs2_truncate_log_shutdown(osb);
- /* disable any new recovery threads and wait for any currently
- * running ones to exit. Do this before setting the vol_state. */
- mutex_lock(&osb->recovery_lock);
- osb->disable_recovery = 1;
- mutex_unlock(&osb->recovery_lock);
- wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
-
- /* At this point, we know that no more recovery threads can be
- * launched, so wait for any recovery completion work to
- * complete. */
- flush_workqueue(ocfs2_wq);
+ /* This will disable recovery and flush any recovery work. */
+ ocfs2_recovery_exit(osb);
ocfs2_journal_shutdown(osb);
ocfs2_sync_blockdev(sb);
- /* No dlm means we've failed during mount, so skip all the
- * steps which depended on that to complete. */
- if (osb->dlm) {
+ ocfs2_purge_refcount_trees(osb);
+
+ /* No cluster connection means we've failed during mount, so skip
+ * all the steps which depended on that to complete. */
+ if (osb->cconn) {
tmp = ocfs2_super_lock(osb, 1);
if (tmp < 0) {
mlog_errno(tmp);
return;
}
+ }
- tmp = ocfs2_request_umount_vote(osb);
- if (tmp < 0)
- mlog_errno(tmp);
-
- if (osb->slot_num != OCFS2_INVALID_SLOT)
- ocfs2_put_slot(osb);
+ if (osb->slot_num != OCFS2_INVALID_SLOT)
+ ocfs2_put_slot(osb);
+ if (osb->cconn)
ocfs2_super_unlock(osb, 1);
- }
ocfs2_release_system_inodes(osb);
- if (osb->dlm) {
- ocfs2_unregister_net_handlers(osb);
-
- ocfs2_dlm_shutdown(osb);
- }
+ /*
+ * If we're dismounting due to mount error, mount.ocfs2 will clean
+ * up heartbeat. If we're a local mount, there is no heartbeat.
+ * If we failed before we got a uuid_str yet, we can't stop
+ * heartbeat. Otherwise, do it.
+ */
+ if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
+ !ocfs2_is_hard_readonly(osb))
+ hangup_needed = 1;
- ocfs2_clear_hb_callbacks(osb);
+ if (osb->cconn)
+ ocfs2_dlm_shutdown(osb, hangup_needed);
+ ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
debugfs_remove(osb->osb_debug_root);
- if (!mnt_err)
- ocfs2_stop_heartbeat(osb);
+ if (hangup_needed)
+ ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
- printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %d)\n",
- osb->dev_str, osb->node_num);
+ if (ocfs2_mount_local(osb))
+ snprintf(nodestr, sizeof(nodestr), "local");
+ else
+ snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
+
+ printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n",
+ osb->dev_str, nodestr);
ocfs2_delete_osb(osb);
kfree(osb);
@@ -1196,7 +1997,7 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN);
- osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
+ osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
if (osb->uuid_str == NULL)
return -ENOMEM;
@@ -1212,22 +2013,50 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
return 0;
}
+/* Make sure entire volume is addressable by our journal. Requires
+ osb_clusters_at_boot to be valid and for the journal to have been
+ initialized by ocfs2_journal_init(). */
+static int ocfs2_journal_addressable(struct ocfs2_super *osb)
+{
+ int status = 0;
+ u64 max_block =
+ ocfs2_clusters_to_blocks(osb->sb,
+ osb->osb_clusters_at_boot) - 1;
+
+ /* 32-bit block number is always OK. */
+ if (max_block <= (u32)~0ULL)
+ goto out;
+
+ /* Volume is "huge", so see if our journal is new enough to
+ support it. */
+ if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_COMPAT_JBD2_SB) &&
+ jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_64BIT))) {
+ mlog(ML_ERROR, "The journal cannot address the entire volume. "
+ "Enable the 'block64' journal option with tunefs.ocfs2");
+ status = -EFBIG;
+ goto out;
+ }
+
+ out:
+ return status;
+}
+
static int ocfs2_initialize_super(struct super_block *sb,
struct buffer_head *bh,
- int sector_size)
+ int sector_size,
+ struct ocfs2_blockcheck_stats *stats)
{
- int status = 0;
- int i;
- struct ocfs2_dinode *di = NULL;
+ int status;
+ int i, cbits, bbits;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
struct inode *inode = NULL;
- struct buffer_head *bitmap_bh = NULL;
struct ocfs2_journal *journal;
- __le32 uuid_net_key;
struct ocfs2_super *osb;
+ u64 total_blocks;
- mlog_entry_void();
-
- osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL);
+ osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL);
if (!osb) {
status = -ENOMEM;
mlog_errno(status);
@@ -1236,30 +2065,40 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_fs_info = osb;
sb->s_op = &ocfs2_sops;
+ sb->s_d_op = &ocfs2_dentry_ops;
sb->s_export_op = &ocfs2_export_ops;
+ sb->s_qcop = &ocfs2_quotactl_ops;
+ sb->dq_op = &ocfs2_quota_operations;
+ sb->s_xattr = ocfs2_xattr_handlers;
+ sb->s_time_gran = 1;
sb->s_flags |= MS_NOATIME;
/* this is needed to support O_LARGEFILE */
- sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits);
+ cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
+ bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
+ sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
+
+ osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
+
+ for (i = 0; i < 3; i++)
+ osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
+ osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
osb->sb = sb;
/* Save off for ocfs2_rw_direct */
osb->s_sectsize_bits = blksize_bits(sector_size);
BUG_ON(!osb->s_sectsize_bits);
- osb->net_response_ids = 0;
- spin_lock_init(&osb->net_response_lock);
- INIT_LIST_HEAD(&osb->net_response_list);
-
- INIT_LIST_HEAD(&osb->osb_net_handlers);
- init_waitqueue_head(&osb->recovery_event);
- spin_lock_init(&osb->vote_task_lock);
- init_waitqueue_head(&osb->vote_event);
- osb->vote_work_sequence = 0;
- osb->vote_wake_sequence = 0;
+ spin_lock_init(&osb->dc_task_lock);
+ init_waitqueue_head(&osb->dc_event);
+ osb->dc_work_sequence = 0;
+ osb->dc_wake_sequence = 0;
INIT_LIST_HEAD(&osb->blocked_lock_list);
osb->blocked_lock_count = 0;
- INIT_LIST_HEAD(&osb->vote_list);
spin_lock_init(&osb->osb_lock);
+ spin_lock_init(&osb->osb_xattr_lock);
+ ocfs2_init_steal_slots(osb);
+
+ mutex_init(&osb->system_file_mutex);
atomic_set(&osb->alloc_stats.moves, 0);
atomic_set(&osb->alloc_stats.local_data, 0);
@@ -1267,29 +2106,52 @@ static int ocfs2_initialize_super(struct super_block *sb,
atomic_set(&osb->alloc_stats.bg_allocs, 0);
atomic_set(&osb->alloc_stats.bg_extends, 0);
+ /* Copy the blockcheck stats from the superblock probe */
+ osb->osb_ecc_stats = *stats;
+
ocfs2_init_node_maps(osb);
snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
- mutex_init(&osb->recovery_lock);
+ osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
+ if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
+ mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
+ osb->max_slots);
+ status = -EINVAL;
+ goto bail;
+ }
- osb->disable_recovery = 0;
- osb->recovery_thread_task = NULL;
+ ocfs2_orphan_scan_init(osb);
+
+ status = ocfs2_recovery_init(osb);
+ if (status) {
+ mlog(ML_ERROR, "Unable to initialize recovery state\n");
+ mlog_errno(status);
+ goto bail;
+ }
init_waitqueue_head(&osb->checkpoint_event);
- atomic_set(&osb->needs_checkpoint, 0);
- osb->node_num = O2NM_INVALID_NODE_NUM;
+ osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
+
osb->slot_num = OCFS2_INVALID_SLOT;
+ osb->s_xattr_inline_size = le16_to_cpu(
+ di->id2.i_super.s_xattr_inline_size);
+
osb->local_alloc_state = OCFS2_LA_UNUSED;
osb->local_alloc_bh = NULL;
-
- ocfs2_setup_hb_callbacks(osb);
+ INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
init_waitqueue_head(&osb->osb_mount_event);
+ status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
if (!osb->vol_label) {
mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -1297,16 +2159,14 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- di = (struct ocfs2_dinode *)bh->b_data;
-
- osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
- if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
- mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
- osb->max_slots);
- status = -EINVAL;
+ osb->slot_recovery_generations =
+ kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
+ GFP_KERNEL);
+ if (!osb->slot_recovery_generations) {
+ status = -ENOMEM;
+ mlog_errno(status);
goto bail;
}
- mlog(0, "max_slots for this device: %u\n", osb->max_slots);
init_waitqueue_head(&osb->osb_wipe_event);
osb->osb_orphan_wipes = kcalloc(osb->max_slots,
@@ -1318,6 +2178,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
+ osb->osb_rf_lock_tree = RB_ROOT;
+
osb->s_feature_compat =
le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
osb->s_feature_ro_compat =
@@ -1339,6 +2201,29 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
+ if (ocfs2_clusterinfo_valid(osb)) {
+ osb->osb_stackflags =
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
+ strlcpy(osb->osb_cluster_stack,
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
+ OCFS2_STACK_LABEL_LEN + 1);
+ if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
+ mlog(ML_ERROR,
+ "couldn't mount because of an invalid "
+ "cluster stack label (%s) \n",
+ osb->osb_cluster_stack);
+ status = -EINVAL;
+ goto bail;
+ }
+ strlcpy(osb->osb_cluster_name,
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
+ OCFS2_CLUSTER_NAME_LEN + 1);
+ } else {
+ /* The empty string is identical with classic tools that
+ * don't know about s_cluster_info. */
+ osb->osb_cluster_stack[0] = '\0';
+ }
+
get_random_bytes(&osb->s_next_generation, sizeof(u32));
/* FIXME
@@ -1350,7 +2235,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
*/
/* initialize our journal structure */
- journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL);
+ journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
if (!journal) {
mlog(ML_ERROR, "unable to alloc journal\n");
status = -ENOMEM;
@@ -1365,14 +2250,16 @@ static int ocfs2_initialize_super(struct super_block *sb,
spin_lock_init(&journal->j_lock);
journal->j_trans_id = (unsigned long) 1;
INIT_LIST_HEAD(&journal->j_la_cleanups);
- INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb);
+ INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
journal->j_state = OCFS2_JOURNAL_FREE;
+ INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
+ init_llist_head(&osb->dquot_drop_list);
+
/* get some pseudo constants for clustersize bits */
osb->s_clustersize_bits =
le32_to_cpu(di->id2.i_super.s_clustersize_bits);
osb->s_clustersize = 1 << osb->s_clustersize_bits;
- mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
@@ -1382,11 +2269,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
- > (u32)~0UL) {
- mlog(ML_ERROR, "Volume might try to write to blocks beyond "
- "what jbd can address in 32 bits.\n");
- status = -EINVAL;
+ total_blocks = ocfs2_clusters_to_blocks(osb->sb,
+ le32_to_cpu(di->i_clusters));
+
+ status = generic_check_addressable(osb->sb->s_blocksize_bits,
+ total_blocks);
+ if (status) {
+ mlog(ML_ERROR, "Volume too large "
+ "to mount safely on this system");
+ status = -EFBIG;
goto bail;
}
@@ -1397,21 +2288,18 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
- osb->net_key = le32_to_cpu(uuid_net_key);
-
- strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
- osb->vol_label[63] = '\0';
+ strlcpy(osb->vol_label, di->id2.i_super.s_label,
+ OCFS2_MAX_VOL_LABEL_LEN);
osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
osb->first_cluster_group_blkno =
le64_to_cpu(di->id2.i_super.s_first_cluster_group);
osb->fs_generation = le32_to_cpu(di->i_fs_generation);
- mlog(0, "vol_label: %s\n", osb->vol_label);
- mlog(0, "uuid: %s\n", osb->uuid_str);
- mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
- (unsigned long long)osb->root_blkno,
- (unsigned long long)osb->system_dir_blkno);
+ osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
+ trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str,
+ (unsigned long long)osb->root_blkno,
+ (unsigned long long)osb->system_dir_blkno,
+ osb->s_clustersize_bits);
osb->osb_dlm_debug = ocfs2_new_dlm_debug();
if (!osb->osb_dlm_debug) {
@@ -1441,34 +2329,20 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
-
- /* We don't have a cluster lock on the bitmap here because
- * we're only interested in static information and the extra
- * complexity at mount time isn't worht it. Don't pass the
- * inode in to the read function though as we don't want it to
- * be put in the cache. */
- status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
- NULL);
+ osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
iput(inode);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- di = (struct ocfs2_dinode *) bitmap_bh->b_data;
- osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
- brelse(bitmap_bh);
- mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
- (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
+ osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
+ osb->s_feature_incompat) * 8;
status = ocfs2_init_slot_info(osb);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+ cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
bail:
- mlog_exit(status);
return status;
}
@@ -1479,14 +2353,23 @@ bail:
*/
static int ocfs2_verify_volume(struct ocfs2_dinode *di,
struct buffer_head *bh,
- u32 blksz)
+ u32 blksz,
+ struct ocfs2_blockcheck_stats *stats)
{
int status = -EAGAIN;
- mlog_entry_void();
-
if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
+ /* We have to do a raw check of the feature here */
+ if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
+ OCFS2_FEATURE_INCOMPAT_META_ECC) {
+ status = ocfs2_block_check_validate(bh->b_data,
+ bh->b_size,
+ &di->i_check,
+ stats);
+ if (status)
+ goto out;
+ }
status = -EINVAL;
if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
mlog(ML_ERROR, "found superblock with incorrect block "
@@ -1506,7 +2389,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
} else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
mlog(ML_ERROR, "bad block number on superblock: "
"found %llu, should be %llu\n",
- (unsigned long long)di->i_blkno,
+ (unsigned long long)le64_to_cpu(di->i_blkno),
(unsigned long long)bh->b_blocknr);
} else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
@@ -1528,20 +2411,21 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
}
}
- mlog_exit(status);
+out:
+ if (status && status != -EAGAIN)
+ mlog_errno(status);
return status;
}
static int ocfs2_check_volume(struct ocfs2_super *osb)
{
- int status = 0;
+ int status;
int dirty;
+ int local;
struct ocfs2_dinode *local_alloc = NULL; /* only used if we
* recover
* ourselves. */
- mlog_entry_void();
-
/* Init our journal object. */
status = ocfs2_journal_init(osb->journal, &dirty);
if (status < 0) {
@@ -1549,6 +2433,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
goto finally;
}
+ /* Now that journal has been initialized, check to make sure
+ entire volume is addressable. */
+ status = ocfs2_journal_addressable(osb);
+ if (status)
+ goto finally;
+
/* If the journal was unmounted cleanly then we don't want to
* recover anything. Otherwise, journal_load will do that
* dirty work for us :) */
@@ -1559,12 +2449,18 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
goto finally;
}
} else {
- mlog(ML_NOTICE, "File system was not unmounted cleanly, "
- "recovering volume.\n");
+ printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
+ "unmounted cleanly, recovering it.\n", osb->dev_str);
}
+ local = ocfs2_mount_local(osb);
+
/* will play back anything left in the journal. */
- ocfs2_journal_load(osb->journal);
+ status = ocfs2_journal_load(osb->journal, local, dirty);
+ if (status < 0) {
+ mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
+ goto finally;
+ }
if (dirty) {
/* recover my local alloc if we didn't unmount cleanly. */
@@ -1579,8 +2475,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
* ourselves as mounted. */
}
- mlog(0, "Journal loaded.\n");
-
status = ocfs2_load_local_alloc(osb);
if (status < 0) {
mlog_errno(status);
@@ -1599,14 +2493,20 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
* lock, and it's marked as dirty, set the bit in the recover
* map and launch a recovery thread for it. */
status = ocfs2_mark_dead_nodes(osb);
+ if (status < 0) {
+ mlog_errno(status);
+ goto finally;
+ }
+
+ status = ocfs2_compute_replay_slots(osb);
if (status < 0)
mlog_errno(status);
finally:
- if (local_alloc)
- kfree(local_alloc);
+ kfree(local_alloc);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1618,27 +2518,22 @@ finally:
*/
static void ocfs2_delete_osb(struct ocfs2_super *osb)
{
- mlog_entry_void();
-
/* This function assumes that the caller has the main osb resource */
- if (osb->slot_info)
- ocfs2_free_slot_info(osb->slot_info);
+ ocfs2_free_slot_info(osb);
kfree(osb->osb_orphan_wipes);
+ kfree(osb->slot_recovery_generations);
/* FIXME
* This belongs in journal shutdown, but because we have to
- * allocate osb->journal at the start of ocfs2_initalize_osb(),
+ * allocate osb->journal at the start of ocfs2_initialize_osb(),
* we free it here.
*/
kfree(osb->journal);
- if (osb->local_alloc_copy)
- kfree(osb->local_alloc_copy);
+ kfree(osb->local_alloc_copy);
kfree(osb->uuid_str);
ocfs2_put_dlm_debug(osb->osb_dlm_debug);
memset(osb, 0, sizeof(struct ocfs2_super));
-
- mlog_exit_void();
}
/* Put OCFS2 into a readonly state, or (if the user specifies it),
@@ -1674,7 +2569,7 @@ void __ocfs2_error(struct super_block *sb,
va_list args;
va_start(args, fmt);
- vsprintf(error_buf, fmt, args);
+ vsnprintf(error_buf, sizeof(error_buf), fmt, args);
va_end(args);
/* Not using mlog here because we want to show the actual
@@ -1695,7 +2590,7 @@ void __ocfs2_abort(struct super_block* sb,
va_list args;
va_start(args, fmt);
- vsprintf(error_buf, fmt, args);
+ vsnprintf(error_buf, sizeof(error_buf), fmt, args);
va_end(args);
printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
@@ -1712,9 +2607,30 @@ void __ocfs2_abort(struct super_block* sb,
/* Force a panic(). This stinks, but it's better than letting
* things continue without having a proper hard readonly
* here. */
- OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
+ if (!ocfs2_mount_local(OCFS2_SB(sb)))
+ OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
ocfs2_handle_error(sb);
}
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset)
+{
+ int rc;
+ sigset_t blocked;
+
+ sigfillset(&blocked);
+ rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
+ BUG_ON(rc);
+}
+
+void ocfs2_unblock_signals(sigset_t *oldset)
+{
+ int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
+ BUG_ON(rc);
+}
+
module_init(ocfs2_init);
module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a..74ff74cf78f 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -31,18 +31,23 @@ extern struct workqueue_struct *ocfs2_wq;
int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
-void __ocfs2_error(struct super_block *sb,
- const char *function,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
+__printf(3, 4)
+void __ocfs2_error(struct super_block *sb, const char *function,
+ const char *fmt, ...);
#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
-void __ocfs2_abort(struct super_block *sb,
- const char *function,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
+__printf(3, 4)
+void __ocfs2_abort(struct super_block *sb, const char *function,
+ const char *fmt, ...);
#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset);
+void ocfs2_unblock_signals(sigset_t *oldset);
+
#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index c0f68aa6c17..66edce7ecfd 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -38,9 +38,8 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
-#include <linux/utsname.h>
+#include <linux/namei.h>
-#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -50,130 +49,52 @@
#include "inode.h"
#include "journal.h"
#include "symlink.h"
+#include "xattr.h"
#include "buffer_head_io.h"
-static char *ocfs2_page_getlink(struct dentry * dentry,
- struct page **ppage);
-static char *ocfs2_fast_symlink_getlink(struct inode *inode,
- struct buffer_head **bh);
-/* get the link contents into pagecache */
-static char *ocfs2_page_getlink(struct dentry * dentry,
- struct page **ppage)
+static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
{
- struct page * page;
- struct address_space *mapping = dentry->d_inode->i_mapping;
- page = read_mapping_page(mapping, 0, NULL);
- if (IS_ERR(page))
- goto sync_fail;
- wait_on_page_locked(page);
- if (!PageUptodate(page))
- goto async_fail;
- *ppage = page;
- return kmap(page);
-
-async_fail:
- page_cache_release(page);
- return ERR_PTR(-EIO);
-
-sync_fail:
- return (char*)page;
-}
-
-static char *ocfs2_fast_symlink_getlink(struct inode *inode,
- struct buffer_head **bh)
-{
- int status;
- char *link = NULL;
+ struct inode *inode = page->mapping->host;
+ struct buffer_head *bh = NULL;
+ int status = ocfs2_read_inode_block(inode, &bh);
struct ocfs2_dinode *fe;
+ const char *link;
+ void *kaddr;
+ size_t len;
- mlog_entry_void();
-
- status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno,
- bh,
- OCFS2_BH_CACHED,
- inode);
if (status < 0) {
mlog_errno(status);
- link = ERR_PTR(status);
- goto bail;
+ return status;
}
- fe = (struct ocfs2_dinode *) (*bh)->b_data;
+ fe = (struct ocfs2_dinode *) bh->b_data;
link = (char *) fe->id2.i_symlink;
-bail:
- mlog_exit(status);
-
- return link;
-}
-
-static int ocfs2_readlink(struct dentry *dentry,
- char __user *buffer,
- int buflen)
-{
- int ret;
- char *link;
- struct buffer_head *bh = NULL;
- struct inode *inode = dentry->d_inode;
-
- mlog_entry_void();
-
- link = ocfs2_fast_symlink_getlink(inode, &bh);
- if (IS_ERR(link)) {
- ret = PTR_ERR(link);
- goto out;
- }
-
- ret = vfs_readlink(dentry, buffer, buflen, link);
-
+ /* will be less than a page size */
+ len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb));
+ kaddr = kmap_atomic(page);
+ memcpy(kaddr, link, len + 1);
+ kunmap_atomic(kaddr);
+ SetPageUptodate(page);
+ unlock_page(page);
brelse(bh);
-out:
- mlog_exit(ret);
- return ret;
-}
-
-static void *ocfs2_follow_link(struct dentry *dentry,
- struct nameidata *nd)
-{
- int status;
- char *link;
- struct inode *inode = dentry->d_inode;
- struct page *page = NULL;
- struct buffer_head *bh = NULL;
-
- if (ocfs2_inode_is_fast_symlink(inode))
- link = ocfs2_fast_symlink_getlink(inode, &bh);
- else
- link = ocfs2_page_getlink(dentry, &page);
- if (IS_ERR(link)) {
- status = PTR_ERR(link);
- mlog_errno(status);
- goto bail;
- }
-
- status = vfs_follow_link(nd, link);
- if (status && status != -ENOENT)
- mlog_errno(status);
-bail:
- if (page) {
- kunmap(page);
- page_cache_release(page);
- }
- if (bh)
- brelse(bh);
-
- return ERR_PTR(status);
+ return 0;
}
-struct inode_operations ocfs2_symlink_inode_operations = {
- .readlink = page_readlink,
- .follow_link = ocfs2_follow_link,
- .getattr = ocfs2_getattr,
+const struct address_space_operations ocfs2_fast_symlink_aops = {
+ .readpage = ocfs2_fast_symlink_readpage,
};
-struct inode_operations ocfs2_fast_symlink_inode_operations = {
- .readlink = ocfs2_readlink,
- .follow_link = ocfs2_follow_link,
+
+const struct inode_operations ocfs2_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = page_follow_link_light,
+ .put_link = page_put_link,
.getattr = ocfs2_getattr,
+ .setattr = ocfs2_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ocfs2_listxattr,
+ .removexattr = generic_removexattr,
+ .fiemap = ocfs2_fiemap,
};
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
index 1ea9e4d9e9e..71ee4245e91 100644
--- a/fs/ocfs2/symlink.h
+++ b/fs/ocfs2/symlink.h
@@ -26,8 +26,8 @@
#ifndef OCFS2_SYMLINK_H
#define OCFS2_SYMLINK_H
-extern struct inode_operations ocfs2_symlink_inode_operations;
-extern struct inode_operations ocfs2_fast_symlink_inode_operations;
+extern const struct inode_operations ocfs2_symlink_inode_operations;
+extern const struct address_space_operations ocfs2_fast_symlink_aops;
/*
* Test whether an inode is a fast symlink.
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 5df6e35d09b..af155c18312 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,10 +25,8 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -45,10 +43,9 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
int type,
u32 slot);
-static inline int is_global_system_inode(int type);
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
- int type,
- u32 slot);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
+#endif
static inline int is_global_system_inode(int type)
{
@@ -56,11 +53,51 @@ static inline int is_global_system_inode(int type)
type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
}
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
- int type,
- u32 slot)
+static struct inode **get_local_system_inode(struct ocfs2_super *osb,
+ int type,
+ u32 slot)
{
- return slot == osb->slot_num || is_global_system_inode(type);
+ int index;
+ struct inode **local_system_inodes, **free = NULL;
+
+ BUG_ON(slot == OCFS2_INVALID_SLOT);
+ BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
+ type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
+
+ spin_lock(&osb->osb_lock);
+ local_system_inodes = osb->local_system_inodes;
+ spin_unlock(&osb->osb_lock);
+
+ if (unlikely(!local_system_inodes)) {
+ local_system_inodes = kzalloc(sizeof(struct inode *) *
+ NUM_LOCAL_SYSTEM_INODES *
+ osb->max_slots,
+ GFP_NOFS);
+ if (!local_system_inodes) {
+ mlog_errno(-ENOMEM);
+ /*
+ * return NULL here so that ocfs2_get_sytem_file_inodes
+ * will try to create an inode and use it. We will try
+ * to initialize local_system_inodes next time.
+ */
+ return NULL;
+ }
+
+ spin_lock(&osb->osb_lock);
+ if (osb->local_system_inodes) {
+ /* Someone has initialized it for us. */
+ free = local_system_inodes;
+ local_system_inodes = osb->local_system_inodes;
+ } else
+ osb->local_system_inodes = local_system_inodes;
+ spin_unlock(&osb->osb_lock);
+ kfree(free);
+ }
+
+ index = (slot * NUM_LOCAL_SYSTEM_INODES) +
+ (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
+
+ return &local_system_inodes[index];
}
struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -71,12 +108,16 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
struct inode **arr = NULL;
/* avoid the lookup if cached in local system file array */
- if (is_in_system_inode_array(osb, type, slot))
- arr = &(osb->system_inodes[type]);
+ if (is_global_system_inode(type)) {
+ arr = &(osb->global_system_inodes[type]);
+ } else
+ arr = get_local_system_inode(osb, type, slot);
+ mutex_lock(&osb->system_file_mutex);
if (arr && ((inode = *arr) != NULL)) {
/* get a ref in addition to the array ref */
inode = igrab(inode);
+ mutex_unlock(&osb->system_file_mutex);
BUG_ON(!inode);
return inode;
@@ -90,6 +131,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
*arr = igrab(inode);
BUG_ON(!*arr);
}
+ mutex_unlock(&osb->system_file_mutex);
return inode;
}
@@ -100,30 +142,41 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
char namebuf[40];
struct inode *inode = NULL;
u64 blkno;
- struct buffer_head *dirent_bh = NULL;
- struct ocfs2_dir_entry *de = NULL;
int status = 0;
ocfs2_sprintf_system_inode_name(namebuf,
sizeof(namebuf),
type, slot);
- status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf),
- &blkno, osb->sys_root_inode,
- &dirent_bh, &de);
+ status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
+ strlen(namebuf), &blkno);
if (status < 0) {
goto bail;
}
- inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE);
+ inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type);
if (IS_ERR(inode)) {
mlog_errno(PTR_ERR(inode));
inode = NULL;
goto bail;
}
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
+ type == LOCAL_GROUP_QUOTA_SYSTEM_INODE ||
+ type == JOURNAL_SYSTEM_INODE) {
+ /* Ignore inode lock on these inodes as the lock does not
+ * really belong to any process and lockdep cannot handle
+ * that */
+ OCFS2_I(inode)->ip_inode_lockres.l_lockdep_map.key = NULL;
+ } else {
+ lockdep_init_map(&OCFS2_I(inode)->ip_inode_lockres.
+ l_lockdep_map,
+ ocfs2_system_inodes[type].si_name,
+ &ocfs2_sysfile_cluster_lock_key[type], 0);
+ }
+#endif
bail:
- if (dirent_bh)
- brelse(dirent_bh);
+
return inode;
}
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 9707ed7a320..82e17b076ce 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,9 +53,6 @@
#include <linux/highmem.h>
#include <linux/buffer_head.h>
#include <linux/rbtree.h>
-#include <linux/jbd.h>
-
-#define MLOG_MASK_PREFIX ML_UPTODATE
#include <cluster/masklog.h>
@@ -63,23 +60,86 @@
#include "inode.h"
#include "uptodate.h"
+#include "ocfs2_trace.h"
struct ocfs2_meta_cache_item {
struct rb_node c_node;
sector_t c_block;
};
-static kmem_cache_t *ocfs2_uptodate_cachep = NULL;
+static struct kmem_cache *ocfs2_uptodate_cachep;
+
+u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
+{
+ BUG_ON(!ci || !ci->ci_ops);
+
+ return ci->ci_ops->co_owner(ci);
+}
+
+struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci)
+{
+ BUG_ON(!ci || !ci->ci_ops);
+
+ return ci->ci_ops->co_get_super(ci);
+}
+
+static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci)
+{
+ BUG_ON(!ci || !ci->ci_ops);
+
+ ci->ci_ops->co_cache_lock(ci);
+}
+
+static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci)
+{
+ BUG_ON(!ci || !ci->ci_ops);
+
+ ci->ci_ops->co_cache_unlock(ci);
+}
+
+void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+ BUG_ON(!ci || !ci->ci_ops);
+
+ ci->ci_ops->co_io_lock(ci);
+}
-void ocfs2_metadata_cache_init(struct inode *inode)
+void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci)
{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+ BUG_ON(!ci || !ci->ci_ops);
- oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
+ ci->ci_ops->co_io_unlock(ci);
+}
+
+
+static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci,
+ int clear)
+{
+ ci->ci_flags |= OCFS2_CACHE_FL_INLINE;
ci->ci_num_cached = 0;
+
+ if (clear) {
+ ci->ci_created_trans = 0;
+ ci->ci_last_trans = 0;
+ }
+}
+
+void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
+ const struct ocfs2_caching_operations *ops)
+{
+ BUG_ON(!ops);
+
+ ci->ci_ops = ops;
+ ocfs2_metadata_cache_reset(ci, 1);
}
+void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci)
+{
+ ocfs2_metadata_cache_purge(ci);
+ ocfs2_metadata_cache_reset(ci, 1);
+}
+
+
/* No lock taken here as 'root' is not expected to be visible to other
* processes. */
static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
@@ -91,8 +151,8 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
while ((node = rb_last(root)) != NULL) {
item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
- mlog(0, "Purge item %llu\n",
- (unsigned long long) item->c_block);
+ trace_ocfs2_purge_copied_metadata_tree(
+ (unsigned long long) item->c_block);
rb_erase(&item->c_node, root);
kmem_cache_free(ocfs2_uptodate_cachep, item);
@@ -108,19 +168,20 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
* This function is a few more lines longer than necessary due to some
* accounting done here, but I think it's worth tracking down those
* bugs sooner -- Mark */
-void ocfs2_metadata_cache_purge(struct inode *inode)
+void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
unsigned int tree, to_purge, purged;
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
struct rb_root root = RB_ROOT;
- spin_lock(&oi->ip_lock);
- tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+ BUG_ON(!ci || !ci->ci_ops);
+
+ ocfs2_metadata_cache_lock(ci);
+ tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
to_purge = ci->ci_num_cached;
- mlog(0, "Purge %u %s items from Inode %llu\n", to_purge,
- tree ? "array" : "tree", (unsigned long long)oi->ip_blkno);
+ trace_ocfs2_metadata_cache_purge(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ to_purge, tree);
/* If we're a tree, save off the root so that we can safely
* initialize the cache. We do the work to free tree members
@@ -128,16 +189,17 @@ void ocfs2_metadata_cache_purge(struct inode *inode)
if (tree)
root = ci->ci_cache.ci_tree;
- ocfs2_metadata_cache_init(inode);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_reset(ci, 0);
+ ocfs2_metadata_cache_unlock(ci);
purged = ocfs2_purge_copied_metadata_tree(&root);
/* If possible, track the number wiped so that we can more
* easily detect counting errors. Unfortunately, this is only
* meaningful for trees. */
if (tree && purged != to_purge)
- mlog(ML_ERROR, "Inode %llu, count = %u, purged = %u\n",
- (unsigned long long)oi->ip_blkno, to_purge, purged);
+ mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ to_purge, purged);
}
/* Returns the index in the cache array, -1 if not found.
@@ -178,39 +240,37 @@ ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
return NULL;
}
-static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
+static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
int index = -1;
struct ocfs2_meta_cache_item *item = NULL;
- spin_lock(&oi->ip_lock);
+ ocfs2_metadata_cache_lock(ci);
- mlog(0, "Inode %llu, query block %llu (inline = %u)\n",
- (unsigned long long)oi->ip_blkno,
- (unsigned long long) bh->b_blocknr,
- !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
+ trace_ocfs2_buffer_cached_begin(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long) bh->b_blocknr,
+ !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
- if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
- index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
- bh->b_blocknr);
+ if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
+ index = ocfs2_search_cache_array(ci, bh->b_blocknr);
else
- item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
- bh->b_blocknr);
+ item = ocfs2_search_cache_tree(ci, bh->b_blocknr);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
- mlog(0, "index = %d, item = %p\n", index, item);
+ trace_ocfs2_buffer_cached_end(index, item);
return (index != -1) || (item != NULL);
}
/* Warning: even if it returns true, this does *not* guarantee that
- * the block is stored in our inode metadata cache.
- *
+ * the block is stored in our inode metadata cache.
+ *
* This can be called under lock_buffer()
*/
-int ocfs2_buffer_uptodate(struct inode *inode,
+int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
/* Doesn't matter if the bh is in our cache or not -- if it's
@@ -226,27 +286,28 @@ int ocfs2_buffer_uptodate(struct inode *inode,
/* Ok, locally the buffer is marked as up to date, now search
* our cache to see if we can trust that. */
- return ocfs2_buffer_cached(OCFS2_I(inode), bh);
+ return ocfs2_buffer_cached(ci, bh);
}
-/*
+/*
* Determine whether a buffer is currently out on a read-ahead request.
- * ip_io_sem should be held to serialize submitters with the logic here.
+ * ci_io_sem should be held to serialize submitters with the logic here.
*/
-int ocfs2_buffer_read_ahead(struct inode *inode,
+int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
- return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh);
+ return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh);
}
/* Requires ip_lock */
static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
sector_t block)
{
- BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
+ BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
- mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
- ci->ci_num_cached);
+ trace_ocfs2_append_cache_array(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)block, ci->ci_num_cached);
ci->ci_cache.ci_array[ci->ci_num_cached] = block;
ci->ci_num_cached++;
@@ -263,8 +324,9 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
struct ocfs2_meta_cache_item *tmp;
- mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
- ci->ci_num_cached);
+ trace_ocfs2_insert_cache_tree(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)block, ci->ci_num_cached);
while(*p) {
parent = *p;
@@ -288,67 +350,65 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
ci->ci_num_cached++;
}
-static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
- struct ocfs2_caching_info *ci)
+/* co_cache_lock() must be held */
+static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci)
{
- assert_spin_locked(&oi->ip_lock);
-
- return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
- (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
+ return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) &&
+ (ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY);
}
-/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
+/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the
* pointers in tree after we use them - this allows caller to detect
- * when to free in case of error. */
-static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
+ * when to free in case of error.
+ *
+ * The co_cache_lock() must be held. */
+static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
struct ocfs2_meta_cache_item **tree)
{
int i;
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
- mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
- "Inode %llu, num cached = %u, should be %u\n",
- (unsigned long long)oi->ip_blkno, ci->ci_num_cached,
- OCFS2_INODE_MAX_CACHE_ARRAY);
- mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
- "Inode %llu not marked as inline anymore!\n",
- (unsigned long long)oi->ip_blkno);
- assert_spin_locked(&oi->ip_lock);
+ mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY,
+ "Owner %llu, num cached = %u, should be %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY);
+ mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE),
+ "Owner %llu not marked as inline anymore!\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci));
/* Be careful to initialize the tree members *first* because
* once the ci_tree is used, the array is junk... */
- for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+ for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
tree[i]->c_block = ci->ci_cache.ci_array[i];
- oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
+ ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE;
ci->ci_cache.ci_tree = RB_ROOT;
/* this will be set again by __ocfs2_insert_cache_tree */
ci->ci_num_cached = 0;
- for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+ for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
__ocfs2_insert_cache_tree(ci, tree[i]);
tree[i] = NULL;
}
- mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n",
- (unsigned long long)oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
+ trace_ocfs2_expand_cache(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ ci->ci_flags, ci->ci_num_cached);
}
/* Slow path function - memory allocation is necessary. See the
* comment above ocfs2_set_buffer_uptodate for more information. */
-static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
+static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
sector_t block,
int expand_tree)
{
int i;
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
struct ocfs2_meta_cache_item *new = NULL;
- struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
+ struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
{ NULL, };
- mlog(0, "Inode %llu, block %llu, expand = %d\n",
- (unsigned long long)oi->ip_blkno,
- (unsigned long long)block, expand_tree);
+ trace_ocfs2_set_buffer_uptodate(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)block, expand_tree);
new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
if (!new) {
@@ -360,7 +420,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
if (expand_tree) {
/* Do *not* allocate an array here - the removal code
* has no way of tracking that. */
- for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+ for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
GFP_NOFS);
if (!tree[i]) {
@@ -372,21 +432,20 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
}
}
- spin_lock(&oi->ip_lock);
- if (ocfs2_insert_can_use_array(oi, ci)) {
- mlog(0, "Someone cleared the tree underneath us\n");
+ ocfs2_metadata_cache_lock(ci);
+ if (ocfs2_insert_can_use_array(ci)) {
/* Ok, items were removed from the cache in between
* locks. Detect this and revert back to the fast path */
ocfs2_append_cache_array(ci, block);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
goto out_free;
}
if (expand_tree)
- ocfs2_expand_cache(oi, tree);
+ ocfs2_expand_cache(ci, tree);
__ocfs2_insert_cache_tree(ci, new);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
new = NULL;
out_free:
@@ -396,14 +455,14 @@ out_free:
/* If these were used, then ocfs2_expand_cache re-set them to
* NULL for us. */
if (tree[0]) {
- for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+ for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
if (tree[i])
kmem_cache_free(ocfs2_uptodate_cachep,
tree[i]);
}
}
-/* Item insertion is guarded by ip_io_mutex, so the insertion path takes
+/* Item insertion is guarded by co_io_lock(), so the insertion path takes
* advantage of this by not rechecking for a duplicate insert during
* the slow case. Additionally, if the cache needs to be bumped up to
* a tree, the code will not recheck after acquiring the lock --
@@ -421,59 +480,55 @@ out_free:
* Readahead buffers can be passed in here before the I/O request is
* completed.
*/
-void ocfs2_set_buffer_uptodate(struct inode *inode,
+void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
int expand;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
/* The block may very well exist in our cache already, so avoid
* doing any more work in that case. */
- if (ocfs2_buffer_cached(oi, bh))
+ if (ocfs2_buffer_cached(ci, bh))
return;
- mlog(0, "Inode %llu, inserting block %llu\n",
- (unsigned long long)oi->ip_blkno,
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_set_buffer_uptodate_begin(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)bh->b_blocknr);
/* No need to recheck under spinlock - insertion is guarded by
- * ip_io_mutex */
- spin_lock(&oi->ip_lock);
- if (ocfs2_insert_can_use_array(oi, ci)) {
+ * co_io_lock() */
+ ocfs2_metadata_cache_lock(ci);
+ if (ocfs2_insert_can_use_array(ci)) {
/* Fast case - it's an array and there's a free
* spot. */
ocfs2_append_cache_array(ci, bh->b_blocknr);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
return;
}
expand = 0;
- if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+ if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
/* We need to bump things up to a tree. */
expand = 1;
}
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
- __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
+ __ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand);
}
/* Called against a newly allocated buffer. Most likely nobody should
* be able to read this sort of metadata while it's still being
- * allocated, but this is careful to take ip_io_mutex anyway. */
-void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+ * allocated, but this is careful to take co_io_lock() anyway. */
+void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
/* This should definitely *not* exist in our cache */
- BUG_ON(ocfs2_buffer_cached(oi, bh));
+ BUG_ON(ocfs2_buffer_cached(ci, bh));
set_buffer_uptodate(bh);
- mutex_lock(&oi->ip_io_mutex);
- ocfs2_set_buffer_uptodate(inode, bh);
- mutex_unlock(&oi->ip_io_mutex);
+ ocfs2_metadata_cache_io_lock(ci);
+ ocfs2_set_buffer_uptodate(ci, bh);
+ ocfs2_metadata_cache_io_unlock(ci);
}
/* Requires ip_lock. */
@@ -483,12 +538,13 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
sector_t *array = ci->ci_cache.ci_array;
int bytes;
- BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
+ BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY);
BUG_ON(index >= ci->ci_num_cached);
BUG_ON(!ci->ci_num_cached);
- mlog(0, "remove index %d (num_cached = %u\n", index,
- ci->ci_num_cached);
+ trace_ocfs2_remove_metadata_array(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ index, ci->ci_num_cached);
ci->ci_num_cached--;
@@ -504,32 +560,27 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
struct ocfs2_meta_cache_item *item)
{
- mlog(0, "remove block %llu from tree\n",
- (unsigned long long) item->c_block);
+ trace_ocfs2_remove_metadata_tree(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)item->c_block);
rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
ci->ci_num_cached--;
}
-/* Called when we remove a chunk of metadata from an inode. We don't
- * bother reverting things to an inlined array in the case of a remove
- * which moves us back under the limit. */
-void ocfs2_remove_from_cache(struct inode *inode,
- struct buffer_head *bh)
+static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
+ sector_t block)
{
int index;
- sector_t block = bh->b_blocknr;
struct ocfs2_meta_cache_item *item = NULL;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
- spin_lock(&oi->ip_lock);
- mlog(0, "Inode %llu, remove %llu, items = %u, array = %u\n",
- (unsigned long long)oi->ip_blkno,
- (unsigned long long) block, ci->ci_num_cached,
- oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+ ocfs2_metadata_cache_lock(ci);
+ trace_ocfs2_remove_block_from_cache(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long) block, ci->ci_num_cached,
+ ci->ci_flags);
- if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+ if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
index = ocfs2_search_cache_array(ci, block);
if (index != -1)
ocfs2_remove_metadata_array(ci, index);
@@ -538,23 +589,45 @@ void ocfs2_remove_from_cache(struct inode *inode,
if (item)
ocfs2_remove_metadata_tree(ci, item);
}
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
if (item)
kmem_cache_free(ocfs2_uptodate_cachep, item);
}
+/*
+ * Called when we remove a chunk of metadata from an inode. We don't
+ * bother reverting things to an inlined array in the case of a remove
+ * which moves us back under the limit.
+ */
+void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
+ struct buffer_head *bh)
+{
+ sector_t block = bh->b_blocknr;
+
+ ocfs2_remove_block_from_cache(ci, block);
+}
+
+/* Called when we remove xattr clusters from an inode. */
+void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
+ sector_t block,
+ u32 c_len)
+{
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len;
+
+ for (i = 0; i < b_len; i++, block++)
+ ocfs2_remove_block_from_cache(ci, block);
+}
+
int __init init_ocfs2_uptodate_cache(void)
{
ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
sizeof(struct ocfs2_meta_cache_item),
- 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+ 0, SLAB_HWCACHE_ALIGN, NULL);
if (!ocfs2_uptodate_cachep)
return -ENOMEM;
- mlog(0, "%u inlined cache items per inode.\n",
- OCFS2_INODE_MAX_CACHE_ARRAY);
-
return 0;
}
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 2e73206059a..0d826fe2da0 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -26,21 +26,59 @@
#ifndef OCFS2_UPTODATE_H
#define OCFS2_UPTODATE_H
+/*
+ * The caching code relies on locking provided by the user of
+ * struct ocfs2_caching_info. These operations connect that up.
+ */
+struct ocfs2_caching_operations {
+ /*
+ * A u64 representing the owning structure. Usually this
+ * is the block number (i_blkno or whatnot). This is used so
+ * that caching log messages can identify the owning structure.
+ */
+ u64 (*co_owner)(struct ocfs2_caching_info *ci);
+
+ /* The superblock is needed during I/O. */
+ struct super_block *(*co_get_super)(struct ocfs2_caching_info *ci);
+ /*
+ * Lock and unlock the caching data. These will not sleep, and
+ * should probably be spinlocks.
+ */
+ void (*co_cache_lock)(struct ocfs2_caching_info *ci);
+ void (*co_cache_unlock)(struct ocfs2_caching_info *ci);
+
+ /*
+ * Lock and unlock for disk I/O. These will sleep, and should
+ * be mutexes.
+ */
+ void (*co_io_lock)(struct ocfs2_caching_info *ci);
+ void (*co_io_unlock)(struct ocfs2_caching_info *ci);
+};
+
int __init init_ocfs2_uptodate_cache(void);
void exit_ocfs2_uptodate_cache(void);
-void ocfs2_metadata_cache_init(struct inode *inode);
-void ocfs2_metadata_cache_purge(struct inode *inode);
+void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
+ const struct ocfs2_caching_operations *ops);
+void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci);
+
+u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci);
-int ocfs2_buffer_uptodate(struct inode *inode,
+int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
-void ocfs2_set_buffer_uptodate(struct inode *inode,
+void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
-void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
-void ocfs2_remove_from_cache(struct inode *inode,
+void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
-int ocfs2_buffer_read_ahead(struct inode *inode,
+void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
+ sector_t block,
+ u32 c_len);
+int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
#endif /* OCFS2_UPTODATE_H */
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
deleted file mode 100644
index 5405ce121c9..00000000000
--- a/fs/ocfs2/ver.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define OCFS2_BUILD_VERSION "1.3.3"
-
-#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
-
-void ocfs2_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
deleted file mode 100644
index 5b4dca79990..00000000000
--- a/fs/ocfs2/vote.c
+++ /dev/null
@@ -1,1027 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * vote.c
- *
- * description here
- *
- * Copyright (C) 2003, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/smp_lock.h>
-#include <linux/kthread.h>
-
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
-#include <cluster/tcp.h>
-
-#include <dlm/dlmapi.h>
-
-#define MLOG_MASK_PREFIX ML_VOTE
-#include <cluster/masklog.h>
-
-#include "ocfs2.h"
-
-#include "alloc.h"
-#include "dlmglue.h"
-#include "extent_map.h"
-#include "heartbeat.h"
-#include "inode.h"
-#include "journal.h"
-#include "slot_map.h"
-#include "vote.h"
-
-#include "buffer_head_io.h"
-
-#define OCFS2_MESSAGE_TYPE_VOTE (0x1)
-#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
-struct ocfs2_msg_hdr
-{
- __be32 h_response_id; /* used to lookup message handle on sending
- * node. */
- __be32 h_request;
- __be64 h_blkno;
- __be32 h_generation;
- __be32 h_node_num; /* node sending this particular message. */
-};
-
-/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
- * for the network. */
-#define OCFS2_VOTE_FILENAME_LEN 256
-struct ocfs2_vote_msg
-{
- struct ocfs2_msg_hdr v_hdr;
- union {
- __be32 v_generic1;
- __be32 v_orphaned_slot; /* Used during delete votes */
- __be32 v_nlink; /* Used during unlink votes */
- } md1; /* Message type dependant 1 */
-};
-
-/* Responses are given these values to maintain backwards
- * compatibility with older ocfs2 versions */
-#define OCFS2_RESPONSE_OK (0)
-#define OCFS2_RESPONSE_BUSY (-16)
-#define OCFS2_RESPONSE_BAD_MSG (-22)
-
-struct ocfs2_response_msg
-{
- struct ocfs2_msg_hdr r_hdr;
- __be32 r_response;
- __be32 r_orphaned_slot;
-};
-
-struct ocfs2_vote_work {
- struct list_head w_list;
- struct ocfs2_vote_msg w_msg;
-};
-
-enum ocfs2_vote_request {
- OCFS2_VOTE_REQ_INVALID = 0,
- OCFS2_VOTE_REQ_DELETE,
- OCFS2_VOTE_REQ_MOUNT,
- OCFS2_VOTE_REQ_UMOUNT,
- OCFS2_VOTE_REQ_LAST
-};
-
-static inline int ocfs2_is_valid_vote_request(int request)
-{
- return OCFS2_VOTE_REQ_INVALID < request &&
- request < OCFS2_VOTE_REQ_LAST;
-}
-
-typedef void (*ocfs2_net_response_callback)(void *priv,
- struct ocfs2_response_msg *resp);
-struct ocfs2_net_response_cb {
- ocfs2_net_response_callback rc_cb;
- void *rc_priv;
-};
-
-struct ocfs2_net_wait_ctxt {
- struct list_head n_list;
- u32 n_response_id;
- wait_queue_head_t n_event;
- struct ocfs2_node_map n_node_map;
- int n_response; /* an agreggate response. 0 if
- * all nodes are go, < 0 on any
- * negative response from any
- * node or network error. */
- struct ocfs2_net_response_cb *n_callback;
-};
-
-static void ocfs2_process_mount_request(struct ocfs2_super *osb,
- unsigned int node_num)
-{
- mlog(0, "MOUNT vote from node %u\n", node_num);
- /* The other node only sends us this message when he has an EX
- * on the superblock, so our recovery threads (if having been
- * launched) are waiting on it.*/
- ocfs2_recovery_map_clear(osb, node_num);
- ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
-
- /* We clear the umount map here because a node may have been
- * previously mounted, safely unmounted but never stopped
- * heartbeating - in which case we'd have a stale entry. */
- ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
-}
-
-static void ocfs2_process_umount_request(struct ocfs2_super *osb,
- unsigned int node_num)
-{
- mlog(0, "UMOUNT vote from node %u\n", node_num);
- ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
- ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
-}
-
-void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
-{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
- assert_spin_locked(&oi->ip_lock);
- /* We set the SKIP_DELETE flag on the inode so we don't try to
- * delete it in delete_inode ourselves, thus avoiding
- * unecessary lock pinging. If the other node failed to wipe
- * the inode as a result of a crash, then recovery will pick
- * up the slack. */
- oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
-}
-
-static int ocfs2_process_delete_request(struct inode *inode,
- int *orphaned_slot)
-{
- int response = OCFS2_RESPONSE_BUSY;
-
- mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
- inode->i_ino, inode->i_nlink, *orphaned_slot);
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
-
- /* Whatever our vote response is, we want to make sure that
- * the orphaned slot is recorded properly on this node *and*
- * on the requesting node. Technically, if the requesting node
- * did not know which slot the inode is orphaned in but we
- * respond with BUSY he doesn't actually need the orphaned
- * slot, but it doesn't hurt to do it here anyway. */
- if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
- mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
- OCFS2_INVALID_SLOT &&
- OCFS2_I(inode)->ip_orphaned_slot !=
- (*orphaned_slot),
- "Inode %llu: This node thinks it's "
- "orphaned in slot %d, messaged it's in %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- OCFS2_I(inode)->ip_orphaned_slot,
- *orphaned_slot);
-
- mlog(0, "Setting orphaned slot for inode %llu to %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- *orphaned_slot);
-
- OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
- } else {
- mlog(0, "Sending back orphaned slot %d for inode %llu\n",
- OCFS2_I(inode)->ip_orphaned_slot,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
-
- *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
- }
-
- /* vote no if the file is still open. */
- if (OCFS2_I(inode)->ip_open_count) {
- mlog(0, "open count = %u\n",
- OCFS2_I(inode)->ip_open_count);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- goto done;
- }
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- /* directories are a bit ugly... What if someone is sitting in
- * it? We want to make sure the inode is removed completely as
- * a result of the iput in process_vote. */
- if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
- mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
- goto done;
- }
-
- if (filemap_fdatawrite(inode->i_mapping)) {
- mlog(ML_ERROR, "Could not sync inode %llu for delete!\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- goto done;
- }
- sync_mapping_buffers(inode->i_mapping);
- truncate_inode_pages(inode->i_mapping, 0);
- ocfs2_extent_map_trunc(inode, 0);
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
- /* double check open count - someone might have raced this
- * thread into ocfs2_file_open while we were writing out
- * data. If we're to allow a wipe of this inode now, we *must*
- * hold the spinlock until we've marked it. */
- if (OCFS2_I(inode)->ip_open_count) {
- mlog(0, "Raced to wipe! open count = %u\n",
- OCFS2_I(inode)->ip_open_count);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- goto done;
- }
-
- /* Mark the inode as being wiped from disk. */
- ocfs2_mark_inode_remotely_deleted(inode);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- /* Not sure this is necessary anymore. */
- d_prune_aliases(inode);
-
- /* If we get here, then we're voting 'yes', so commit the
- * delete on our side. */
- response = OCFS2_RESPONSE_OK;
-done:
- return response;
-}
-
-static void ocfs2_process_vote(struct ocfs2_super *osb,
- struct ocfs2_vote_msg *msg)
-{
- int net_status, vote_response;
- int orphaned_slot = 0;
- unsigned int node_num, generation;
- u64 blkno;
- enum ocfs2_vote_request request;
- struct inode *inode = NULL;
- struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
- struct ocfs2_response_msg response;
-
- /* decode the network mumbo jumbo into local variables. */
- request = be32_to_cpu(hdr->h_request);
- blkno = be64_to_cpu(hdr->h_blkno);
- generation = be32_to_cpu(hdr->h_generation);
- node_num = be32_to_cpu(hdr->h_node_num);
- if (request == OCFS2_VOTE_REQ_DELETE)
- orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
-
- mlog(0, "processing vote: request = %u, blkno = %llu, "
- "generation = %u, node_num = %u, priv1 = %u\n", request,
- (unsigned long long)blkno, generation, node_num,
- be32_to_cpu(msg->md1.v_generic1));
-
- if (!ocfs2_is_valid_vote_request(request)) {
- mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
- request, node_num);
- vote_response = OCFS2_RESPONSE_BAD_MSG;
- goto respond;
- }
-
- vote_response = OCFS2_RESPONSE_OK;
-
- switch (request) {
- case OCFS2_VOTE_REQ_UMOUNT:
- ocfs2_process_umount_request(osb, node_num);
- goto respond;
- case OCFS2_VOTE_REQ_MOUNT:
- ocfs2_process_mount_request(osb, node_num);
- goto respond;
- default:
- /* avoids a gcc warning */
- break;
- }
-
- /* We cannot process the remaining message types before we're
- * fully mounted. It's perfectly safe however to send a 'yes'
- * response as we can't possibly have any of the state they're
- * asking us to modify yet. */
- if (atomic_read(&osb->vol_state) == VOLUME_INIT)
- goto respond;
-
- /* If we get here, then the request is against an inode. */
- inode = ocfs2_ilookup_for_vote(osb, blkno,
- request == OCFS2_VOTE_REQ_DELETE);
-
- /* Not finding the inode is perfectly valid - it means we're
- * not interested in what the other node is about to do to it
- * so in those cases we automatically respond with an
- * affirmative. Cluster locking ensures that we won't race
- * interest in the inode with this vote request. */
- if (!inode)
- goto respond;
-
- /* Check generation values. It's possible for us to get a
- * request against a stale inode. If so then we proceed as if
- * we had not found an inode in the first place. */
- if (inode->i_generation != generation) {
- mlog(0, "generation passed %u != inode generation = %u, "
- "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, "
- "message type = %u\n", generation, inode->i_generation,
- OCFS2_I(inode)->ip_flags,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)blkno, atomic_read(&inode->i_count),
- request);
- iput(inode);
- inode = NULL;
- goto respond;
- }
-
- switch (request) {
- case OCFS2_VOTE_REQ_DELETE:
- vote_response = ocfs2_process_delete_request(inode,
- &orphaned_slot);
- break;
- default:
- mlog(ML_ERROR, "node %u, invalid request: %u\n",
- node_num, request);
- vote_response = OCFS2_RESPONSE_BAD_MSG;
- }
-
-respond:
- /* Response struture is small so we just put it on the stack
- * and stuff it inline. */
- memset(&response, 0, sizeof(struct ocfs2_response_msg));
- response.r_hdr.h_response_id = hdr->h_response_id;
- response.r_hdr.h_blkno = hdr->h_blkno;
- response.r_hdr.h_generation = hdr->h_generation;
- response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
- response.r_response = cpu_to_be32(vote_response);
- response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
-
- net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
- osb->net_key,
- &response,
- sizeof(struct ocfs2_response_msg),
- node_num,
- NULL);
- /* We still want to error print for ENOPROTOOPT here. The
- * sending node shouldn't have unregistered his net handler
- * without sending an unmount vote 1st */
- if (net_status < 0
- && net_status != -ETIMEDOUT
- && net_status != -ENOTCONN)
- mlog(ML_ERROR, "message to node %u fails with error %d!\n",
- node_num, net_status);
-
- if (inode)
- iput(inode);
-}
-
-static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
-{
- unsigned long processed;
- struct ocfs2_lock_res *lockres;
- struct ocfs2_vote_work *work;
-
- mlog_entry_void();
-
- spin_lock(&osb->vote_task_lock);
- /* grab this early so we know to try again if a state change and
- * wake happens part-way through our work */
- osb->vote_work_sequence = osb->vote_wake_sequence;
-
- processed = osb->blocked_lock_count;
- while (processed) {
- BUG_ON(list_empty(&osb->blocked_lock_list));
-
- lockres = list_entry(osb->blocked_lock_list.next,
- struct ocfs2_lock_res, l_blocked_list);
- list_del_init(&lockres->l_blocked_list);
- osb->blocked_lock_count--;
- spin_unlock(&osb->vote_task_lock);
-
- BUG_ON(!processed);
- processed--;
-
- ocfs2_process_blocked_lock(osb, lockres);
-
- spin_lock(&osb->vote_task_lock);
- }
-
- while (osb->vote_count) {
- BUG_ON(list_empty(&osb->vote_list));
- work = list_entry(osb->vote_list.next,
- struct ocfs2_vote_work, w_list);
- list_del(&work->w_list);
- osb->vote_count--;
- spin_unlock(&osb->vote_task_lock);
-
- ocfs2_process_vote(osb, &work->w_msg);
- kfree(work);
-
- spin_lock(&osb->vote_task_lock);
- }
- spin_unlock(&osb->vote_task_lock);
-
- mlog_exit_void();
-}
-
-static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
-{
- int empty = 0;
-
- spin_lock(&osb->vote_task_lock);
- if (list_empty(&osb->blocked_lock_list) &&
- list_empty(&osb->vote_list))
- empty = 1;
-
- spin_unlock(&osb->vote_task_lock);
- return empty;
-}
-
-static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
-{
- int should_wake = 0;
-
- spin_lock(&osb->vote_task_lock);
- if (osb->vote_work_sequence != osb->vote_wake_sequence)
- should_wake = 1;
- spin_unlock(&osb->vote_task_lock);
-
- return should_wake;
-}
-
-int ocfs2_vote_thread(void *arg)
-{
- int status = 0;
- struct ocfs2_super *osb = arg;
-
- /* only quit once we've been asked to stop and there is no more
- * work available */
- while (!(kthread_should_stop() &&
- ocfs2_vote_thread_lists_empty(osb))) {
-
- wait_event_interruptible(osb->vote_event,
- ocfs2_vote_thread_should_wake(osb) ||
- kthread_should_stop());
-
- mlog(0, "vote_thread: awoken\n");
-
- ocfs2_vote_thread_do_work(osb);
- }
-
- osb->vote_task = NULL;
- return status;
-}
-
-static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
-{
- struct ocfs2_net_wait_ctxt *w;
-
- w = kcalloc(1, sizeof(*w), GFP_NOFS);
- if (!w) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
-
- INIT_LIST_HEAD(&w->n_list);
- init_waitqueue_head(&w->n_event);
- ocfs2_node_map_init(&w->n_node_map);
- w->n_response_id = response_id;
- w->n_callback = NULL;
-bail:
- return w;
-}
-
-static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
-{
- unsigned int ret;
-
- spin_lock(&osb->net_response_lock);
- ret = ++osb->net_response_ids;
- spin_unlock(&osb->net_response_lock);
-
- return ret;
-}
-
-static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
- struct ocfs2_net_wait_ctxt *w)
-{
- spin_lock(&osb->net_response_lock);
- list_del(&w->n_list);
- spin_unlock(&osb->net_response_lock);
-}
-
-static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
- struct ocfs2_net_wait_ctxt *w)
-{
- spin_lock(&osb->net_response_lock);
- list_add_tail(&w->n_list,
- &osb->net_response_list);
- spin_unlock(&osb->net_response_lock);
-}
-
-static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
- struct ocfs2_net_wait_ctxt *w,
- int node_num)
-{
- assert_spin_locked(&osb->net_response_lock);
-
- ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
- if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
- wake_up(&w->n_event);
-}
-
-/* Intended to be called from the node down callback, we fake remove
- * the node from all our response contexts */
-void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
- int node_num)
-{
- struct list_head *p;
- struct ocfs2_net_wait_ctxt *w = NULL;
-
- spin_lock(&osb->net_response_lock);
-
- list_for_each(p, &osb->net_response_list) {
- w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
-
- __ocfs2_mark_node_responded(osb, w, node_num);
- }
-
- spin_unlock(&osb->net_response_lock);
-}
-
-static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
- struct ocfs2_vote_msg *request,
- unsigned int response_id,
- int *response,
- struct ocfs2_net_response_cb *callback)
-{
- int status, i, remote_err;
- struct ocfs2_net_wait_ctxt *w = NULL;
- int dequeued = 0;
-
- mlog_entry_void();
-
- w = ocfs2_new_net_wait_ctxt(response_id);
- if (!w) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
- w->n_callback = callback;
-
- /* we're pretty much ready to go at this point, and this fills
- * in n_response which we need anyway... */
- ocfs2_queue_net_wait_ctxt(osb, w);
-
- i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
-
- while (i != O2NM_INVALID_NODE_NUM) {
- if (i != osb->node_num) {
- mlog(0, "trying to send request to node %i\n", i);
- ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
-
- remote_err = 0;
- status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
- osb->net_key,
- request,
- sizeof(*request),
- i,
- &remote_err);
- if (status == -ETIMEDOUT) {
- mlog(0, "remote node %d timed out!\n", i);
- status = -EAGAIN;
- goto bail;
- }
- if (remote_err < 0) {
- status = remote_err;
- mlog(0, "remote error %d on node %d!\n",
- remote_err, i);
- mlog_errno(status);
- goto bail;
- }
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
- i++;
- i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
- mlog(0, "next is %d, i am %d\n", i, osb->node_num);
- }
- mlog(0, "done sending, now waiting on responses...\n");
-
- wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
-
- ocfs2_dequeue_net_wait_ctxt(osb, w);
- dequeued = 1;
-
- *response = w->n_response;
- status = 0;
-bail:
- if (w) {
- if (!dequeued)
- ocfs2_dequeue_net_wait_ctxt(osb, w);
- kfree(w);
- }
-
- mlog_exit(status);
- return status;
-}
-
-static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
- u64 blkno,
- unsigned int generation,
- enum ocfs2_vote_request type,
- u32 priv)
-{
- struct ocfs2_vote_msg *request;
- struct ocfs2_msg_hdr *hdr;
-
- BUG_ON(!ocfs2_is_valid_vote_request(type));
-
- request = kcalloc(1, sizeof(*request), GFP_NOFS);
- if (!request) {
- mlog_errno(-ENOMEM);
- } else {
- hdr = &request->v_hdr;
- hdr->h_node_num = cpu_to_be32(osb->node_num);
- hdr->h_request = cpu_to_be32(type);
- hdr->h_blkno = cpu_to_be64(blkno);
- hdr->h_generation = cpu_to_be32(generation);
-
- request->md1.v_generic1 = cpu_to_be32(priv);
- }
-
- return request;
-}
-
-/* Complete the buildup of a new vote request and process the
- * broadcast return value. */
-static int ocfs2_do_request_vote(struct ocfs2_super *osb,
- struct ocfs2_vote_msg *request,
- struct ocfs2_net_response_cb *callback)
-{
- int status, response;
- unsigned int response_id;
- struct ocfs2_msg_hdr *hdr;
-
- response_id = ocfs2_new_response_id(osb);
-
- hdr = &request->v_hdr;
- hdr->h_response_id = cpu_to_be32(response_id);
-
- status = ocfs2_broadcast_vote(osb, request, response_id, &response,
- callback);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- status = response;
-bail:
-
- return status;
-}
-
-static int ocfs2_request_vote(struct inode *inode,
- struct ocfs2_vote_msg *request,
- struct ocfs2_net_response_cb *callback)
-{
- int status;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
- if (ocfs2_inode_is_new(inode))
- return 0;
-
- status = -EAGAIN;
- while (status == -EAGAIN) {
- if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
- signal_pending(current))
- return -ERESTARTSYS;
-
- status = ocfs2_super_lock(osb, 0);
- if (status < 0) {
- mlog_errno(status);
- break;
- }
-
- status = 0;
- if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
- osb->node_num))
- status = ocfs2_do_request_vote(osb, request, callback);
-
- ocfs2_super_unlock(osb, 0);
- }
- return status;
-}
-
-static void ocfs2_delete_response_cb(void *priv,
- struct ocfs2_response_msg *resp)
-{
- int orphaned_slot, node;
- struct inode *inode = priv;
-
- orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
- node = be32_to_cpu(resp->r_hdr.h_node_num);
- mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n",
- node, (unsigned long long)OCFS2_I(inode)->ip_blkno,
- orphaned_slot);
-
- /* The other node may not actually know which slot the inode
- * is orphaned in. */
- if (orphaned_slot == OCFS2_INVALID_SLOT)
- return;
-
- /* Ok, the responding node knows which slot this inode is
- * orphaned in. We verify that the information is correct and
- * then record this in the inode. ocfs2_delete_inode will use
- * this information to determine which lock to take. */
- spin_lock(&OCFS2_I(inode)->ip_lock);
- mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
- OCFS2_I(inode)->ip_orphaned_slot
- != OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's "
- "orphaned in slot %d, we think it's in %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- be32_to_cpu(resp->r_hdr.h_node_num),
- orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
-
- OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-}
-
-int ocfs2_request_delete_vote(struct inode *inode)
-{
- int orphaned_slot, status;
- struct ocfs2_net_response_cb delete_cb;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_vote_msg *request;
-
- spin_lock(&OCFS2_I(inode)->ip_lock);
- orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-
- delete_cb.rc_cb = ocfs2_delete_response_cb;
- delete_cb.rc_priv = inode;
-
- mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot);
-
- status = -ENOMEM;
- request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
- inode->i_generation,
- OCFS2_VOTE_REQ_DELETE, orphaned_slot);
- if (request) {
- status = ocfs2_request_vote(inode, request, &delete_cb);
-
- kfree(request);
- }
-
- return status;
-}
-
-int ocfs2_request_mount_vote(struct ocfs2_super *osb)
-{
- int status;
- struct ocfs2_vote_msg *request = NULL;
-
- request = ocfs2_new_vote_request(osb, 0ULL, 0,
- OCFS2_VOTE_REQ_MOUNT, 0);
- if (!request) {
- status = -ENOMEM;
- goto bail;
- }
-
- status = -EAGAIN;
- while (status == -EAGAIN) {
- if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
- signal_pending(current)) {
- status = -ERESTARTSYS;
- goto bail;
- }
-
- if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
- osb->node_num)) {
- status = 0;
- goto bail;
- }
-
- status = ocfs2_do_request_vote(osb, request, NULL);
- }
-
-bail:
- kfree(request);
- return status;
-}
-
-int ocfs2_request_umount_vote(struct ocfs2_super *osb)
-{
- int status;
- struct ocfs2_vote_msg *request = NULL;
-
- request = ocfs2_new_vote_request(osb, 0ULL, 0,
- OCFS2_VOTE_REQ_UMOUNT, 0);
- if (!request) {
- status = -ENOMEM;
- goto bail;
- }
-
- status = -EAGAIN;
- while (status == -EAGAIN) {
- /* Do not check signals on this vote... We really want
- * this one to go all the way through. */
-
- if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
- osb->node_num)) {
- status = 0;
- goto bail;
- }
-
- status = ocfs2_do_request_vote(osb, request, NULL);
- }
-
-bail:
- kfree(request);
- return status;
-}
-
-/* TODO: This should eventually be a hash table! */
-static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
- u32 response_id)
-{
- struct list_head *p;
- struct ocfs2_net_wait_ctxt *w = NULL;
-
- list_for_each(p, &osb->net_response_list) {
- w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
- if (response_id == w->n_response_id)
- break;
- w = NULL;
- }
-
- return w;
-}
-
-/* Translate response codes into local node errno values */
-static inline int ocfs2_translate_response(int response)
-{
- int ret;
-
- switch (response) {
- case OCFS2_RESPONSE_OK:
- ret = 0;
- break;
-
- case OCFS2_RESPONSE_BUSY:
- ret = -EBUSY;
- break;
-
- default:
- ret = -EINVAL;
- }
-
- return ret;
-}
-
-static int ocfs2_handle_response_message(struct o2net_msg *msg,
- u32 len,
- void *data)
-{
- unsigned int response_id, node_num;
- int response_status;
- struct ocfs2_super *osb = data;
- struct ocfs2_response_msg *resp;
- struct ocfs2_net_wait_ctxt * w;
- struct ocfs2_net_response_cb *resp_cb;
-
- resp = (struct ocfs2_response_msg *) msg->buf;
-
- response_id = be32_to_cpu(resp->r_hdr.h_response_id);
- node_num = be32_to_cpu(resp->r_hdr.h_node_num);
- response_status =
- ocfs2_translate_response(be32_to_cpu(resp->r_response));
-
- mlog(0, "received response message:\n");
- mlog(0, "h_response_id = %u\n", response_id);
- mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
- mlog(0, "h_blkno = %llu\n",
- (unsigned long long)be64_to_cpu(resp->r_hdr.h_blkno));
- mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
- mlog(0, "h_node_num = %u\n", node_num);
- mlog(0, "r_response = %d\n", response_status);
-
- spin_lock(&osb->net_response_lock);
- w = __ocfs2_find_net_wait_ctxt(osb, response_id);
- if (!w) {
- mlog(0, "request not found!\n");
- goto bail;
- }
- resp_cb = w->n_callback;
-
- if (response_status && (!w->n_response)) {
- /* we only really need one negative response so don't
- * set it twice. */
- w->n_response = response_status;
- }
-
- if (resp_cb) {
- spin_unlock(&osb->net_response_lock);
-
- resp_cb->rc_cb(resp_cb->rc_priv, resp);
-
- spin_lock(&osb->net_response_lock);
- }
-
- __ocfs2_mark_node_responded(osb, w, node_num);
-bail:
- spin_unlock(&osb->net_response_lock);
-
- return 0;
-}
-
-static int ocfs2_handle_vote_message(struct o2net_msg *msg,
- u32 len,
- void *data)
-{
- int status;
- struct ocfs2_super *osb = data;
- struct ocfs2_vote_work *work;
-
- work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS);
- if (!work) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
-
- INIT_LIST_HEAD(&work->w_list);
- memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
-
- mlog(0, "scheduling vote request:\n");
- mlog(0, "h_response_id = %u\n",
- be32_to_cpu(work->w_msg.v_hdr.h_response_id));
- mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
- mlog(0, "h_blkno = %llu\n",
- (unsigned long long)be64_to_cpu(work->w_msg.v_hdr.h_blkno));
- mlog(0, "h_generation = %u\n",
- be32_to_cpu(work->w_msg.v_hdr.h_generation));
- mlog(0, "h_node_num = %u\n",
- be32_to_cpu(work->w_msg.v_hdr.h_node_num));
- mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
-
- spin_lock(&osb->vote_task_lock);
- list_add_tail(&work->w_list, &osb->vote_list);
- osb->vote_count++;
- spin_unlock(&osb->vote_task_lock);
-
- ocfs2_kick_vote_thread(osb);
-
- status = 0;
-bail:
- return status;
-}
-
-void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
-{
- if (!osb->net_key)
- return;
-
- o2net_unregister_handler_list(&osb->osb_net_handlers);
-
- if (!list_empty(&osb->net_response_list))
- mlog(ML_ERROR, "net response list not empty!\n");
-
- osb->net_key = 0;
-}
-
-int ocfs2_register_net_handlers(struct ocfs2_super *osb)
-{
- int status = 0;
-
- status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
- osb->net_key,
- sizeof(struct ocfs2_response_msg),
- ocfs2_handle_response_message,
- osb, &osb->osb_net_handlers);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
-
- status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
- osb->net_key,
- sizeof(struct ocfs2_vote_msg),
- ocfs2_handle_vote_message,
- osb, &osb->osb_net_handlers);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
-bail:
- if (status < 0)
- ocfs2_unregister_net_handlers(osb);
-
- return status;
-}
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
deleted file mode 100644
index 53ebc1c69e5..00000000000
--- a/fs/ocfs2/vote.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * vote.h
- *
- * description here
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-
-#ifndef VOTE_H
-#define VOTE_H
-
-int ocfs2_vote_thread(void *arg);
-static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
-{
- spin_lock(&osb->vote_task_lock);
- /* make sure the voting thread gets a swipe at whatever changes
- * the caller may have made to the voting state */
- osb->vote_wake_sequence++;
- spin_unlock(&osb->vote_task_lock);
- wake_up(&osb->vote_event);
-}
-
-int ocfs2_request_delete_vote(struct inode *inode);
-int ocfs2_request_mount_vote(struct ocfs2_super *osb);
-int ocfs2_request_umount_vote(struct ocfs2_super *osb);
-int ocfs2_register_net_handlers(struct ocfs2_super *osb);
-void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
-
-void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
-
-void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
- int node_num);
-#endif
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
new file mode 100644
index 00000000000..016f01df382
--- /dev/null
+++ b/fs/ocfs2/xattr.c
@@ -0,0 +1,7427 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.c
+ *
+ * Copyright (C) 2004, 2008 Oracle. All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copy from linux/fs/ext3/xattr.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/splice.h>
+#include <linux/mount.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+#include <linux/sort.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/security.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "blockcheck.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "inode.h"
+#include "journal.h"
+#include "ocfs2_fs.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "buffer_head_io.h"
+#include "super.h"
+#include "xattr.h"
+#include "refcounttree.h"
+#include "acl.h"
+#include "ocfs2_trace.h"
+
+struct ocfs2_xattr_def_value_root {
+ struct ocfs2_xattr_value_root xv;
+ struct ocfs2_extent_rec er;
+};
+
+struct ocfs2_xattr_bucket {
+ /* The inode these xattrs are associated with */
+ struct inode *bu_inode;
+
+ /* The actual buffers that make up the bucket */
+ struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+
+ /* How many blocks make up one bucket for this filesystem */
+ int bu_blocks;
+};
+
+struct ocfs2_xattr_set_ctxt {
+ handle_t *handle;
+ struct ocfs2_alloc_context *meta_ac;
+ struct ocfs2_alloc_context *data_ac;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ int set_abort;
+};
+
+#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
+#define OCFS2_XATTR_INLINE_SIZE 80
+#define OCFS2_XATTR_HEADER_GAP 4
+#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \
+ - sizeof(struct ocfs2_xattr_header) \
+ - OCFS2_XATTR_HEADER_GAP)
+#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \
+ - sizeof(struct ocfs2_xattr_block) \
+ - sizeof(struct ocfs2_xattr_header) \
+ - OCFS2_XATTR_HEADER_GAP)
+
+static struct ocfs2_xattr_def_value_root def_xv = {
+ .xv.xr_list.l_count = cpu_to_le16(1),
+};
+
+const struct xattr_handler *ocfs2_xattr_handlers[] = {
+ &ocfs2_xattr_user_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+ &ocfs2_xattr_trusted_handler,
+ &ocfs2_xattr_security_handler,
+ NULL
+};
+
+static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
+ [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
+ [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
+ = &posix_acl_access_xattr_handler,
+ [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
+ = &posix_acl_default_xattr_handler,
+ [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
+ [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
+};
+
+struct ocfs2_xattr_info {
+ int xi_name_index;
+ const char *xi_name;
+ int xi_name_len;
+ const void *xi_value;
+ size_t xi_value_len;
+};
+
+struct ocfs2_xattr_search {
+ struct buffer_head *inode_bh;
+ /*
+ * xattr_bh point to the block buffer head which has extended attribute
+ * when extended attribute in inode, xattr_bh is equal to inode_bh.
+ */
+ struct buffer_head *xattr_bh;
+ struct ocfs2_xattr_header *header;
+ struct ocfs2_xattr_bucket *bucket;
+ void *base;
+ void *end;
+ struct ocfs2_xattr_entry *here;
+ int not_found;
+};
+
+/* Operations on struct ocfs2_xa_entry */
+struct ocfs2_xa_loc;
+struct ocfs2_xa_loc_operations {
+ /*
+ * Journal functions
+ */
+ int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc,
+ int type);
+ void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc);
+
+ /*
+ * Return a pointer to the appropriate buffer in loc->xl_storage
+ * at the given offset from loc->xl_header.
+ */
+ void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset);
+
+ /* Can we reuse the existing entry for the new value? */
+ int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi);
+
+ /* How much space is needed for the new value? */
+ int (*xlo_check_space)(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi);
+
+ /*
+ * Return the offset of the first name+value pair. This is
+ * the start of our downward-filling free space.
+ */
+ int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc);
+
+ /*
+ * Remove the name+value at this location. Do whatever is
+ * appropriate with the remaining name+value pairs.
+ */
+ void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc);
+
+ /* Fill xl_entry with a new entry */
+ void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash);
+
+ /* Add name+value storage to an entry */
+ void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size);
+
+ /*
+ * Initialize the value buf's access and bh fields for this entry.
+ * ocfs2_xa_fill_value_buf() will handle the xv pointer.
+ */
+ void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_value_buf *vb);
+};
+
+/*
+ * Describes an xattr entry location. This is a memory structure
+ * tracking the on-disk structure.
+ */
+struct ocfs2_xa_loc {
+ /* This xattr belongs to this inode */
+ struct inode *xl_inode;
+
+ /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */
+ struct ocfs2_xattr_header *xl_header;
+
+ /* Bytes from xl_header to the end of the storage */
+ int xl_size;
+
+ /*
+ * The ocfs2_xattr_entry this location describes. If this is
+ * NULL, this location describes the on-disk structure where it
+ * would have been.
+ */
+ struct ocfs2_xattr_entry *xl_entry;
+
+ /*
+ * Internal housekeeping
+ */
+
+ /* Buffer(s) containing this entry */
+ void *xl_storage;
+
+ /* Operations on the storage backing this location */
+ const struct ocfs2_xa_loc_operations *xl_ops;
+};
+
+/*
+ * Convenience functions to calculate how much space is needed for a
+ * given name+value pair
+ */
+static int namevalue_size(int name_len, uint64_t value_len)
+{
+ if (value_len > OCFS2_XATTR_INLINE_SIZE)
+ return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+ else
+ return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
+}
+
+static int namevalue_size_xi(struct ocfs2_xattr_info *xi)
+{
+ return namevalue_size(xi->xi_name_len, xi->xi_value_len);
+}
+
+static int namevalue_size_xe(struct ocfs2_xattr_entry *xe)
+{
+ u64 value_len = le64_to_cpu(xe->xe_value_size);
+
+ BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) &&
+ ocfs2_xattr_is_local(xe));
+ return namevalue_size(xe->xe_name_len, value_len);
+}
+
+
+static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
+ struct ocfs2_xattr_header *xh,
+ int index,
+ int *block_off,
+ int *new_offset);
+
+static int ocfs2_xattr_block_find(struct inode *inode,
+ int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs);
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+ struct buffer_head *root_bh,
+ int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs);
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+ struct buffer_head *blk_bh,
+ char *buffer,
+ size_t buffer_size);
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt);
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt);
+
+typedef int (xattr_tree_rec_func)(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno, u32 cpos, u32 len, void *para);
+static int ocfs2_iterate_xattr_index_block(struct inode *inode,
+ struct buffer_head *root_bh,
+ xattr_tree_rec_func *rec_func,
+ void *para);
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para);
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno,
+ u32 cpos,
+ u32 len,
+ void *para);
+
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+ u64 src_blk, u64 last_blk, u64 to_blk,
+ unsigned int start_bucket,
+ u32 *first_hash);
+static int ocfs2_prepare_refcount_xattr(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xis,
+ struct ocfs2_xattr_search *xbs,
+ struct ocfs2_refcount_tree **ref_tree,
+ int *meta_need,
+ int *credits);
+static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
+ struct ocfs2_xattr_bucket *bucket,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **bh);
+
+static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
+{
+ return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
+}
+
+static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
+{
+ return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
+}
+
+#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
+#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
+#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
+
+static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode)
+{
+ struct ocfs2_xattr_bucket *bucket;
+ int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET);
+
+ bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS);
+ if (bucket) {
+ bucket->bu_inode = inode;
+ bucket->bu_blocks = blks;
+ }
+
+ return bucket;
+}
+
+static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket)
+{
+ int i;
+
+ for (i = 0; i < bucket->bu_blocks; i++) {
+ brelse(bucket->bu_bhs[i]);
+ bucket->bu_bhs[i] = NULL;
+ }
+}
+
+static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
+{
+ if (bucket) {
+ ocfs2_xattr_bucket_relse(bucket);
+ bucket->bu_inode = NULL;
+ kfree(bucket);
+ }
+}
+
+/*
+ * A bucket that has never been written to disk doesn't need to be
+ * read. We just need the buffer_heads. Don't call this for
+ * buckets that are already on disk. ocfs2_read_xattr_bucket() initializes
+ * them fully.
+ */
+static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+ u64 xb_blkno, int new)
+{
+ int i, rc = 0;
+
+ for (i = 0; i < bucket->bu_blocks; i++) {
+ bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
+ xb_blkno + i);
+ if (!bucket->bu_bhs[i]) {
+ rc = -ENOMEM;
+ mlog_errno(rc);
+ break;
+ }
+
+ if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+ bucket->bu_bhs[i])) {
+ if (new)
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+ bucket->bu_bhs[i]);
+ else {
+ set_buffer_uptodate(bucket->bu_bhs[i]);
+ ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+ bucket->bu_bhs[i]);
+ }
+ }
+ }
+
+ if (rc)
+ ocfs2_xattr_bucket_relse(bucket);
+ return rc;
+}
+
+/* Read the xattr bucket at xb_blkno */
+static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+ u64 xb_blkno)
+{
+ int rc;
+
+ rc = ocfs2_read_blocks(INODE_CACHE(bucket->bu_inode), xb_blkno,
+ bucket->bu_blocks, bucket->bu_bhs, 0,
+ NULL);
+ if (!rc) {
+ spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
+ rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
+ bucket->bu_bhs,
+ bucket->bu_blocks,
+ &bucket_xh(bucket)->xh_check);
+ spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
+ if (rc)
+ mlog_errno(rc);
+ }
+
+ if (rc)
+ ocfs2_xattr_bucket_relse(bucket);
+ return rc;
+}
+
+static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
+ struct ocfs2_xattr_bucket *bucket,
+ int type)
+{
+ int i, rc = 0;
+
+ for (i = 0; i < bucket->bu_blocks; i++) {
+ rc = ocfs2_journal_access(handle,
+ INODE_CACHE(bucket->bu_inode),
+ bucket->bu_bhs[i], type);
+ if (rc) {
+ mlog_errno(rc);
+ break;
+ }
+ }
+
+ return rc;
+}
+
+static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
+ struct ocfs2_xattr_bucket *bucket)
+{
+ int i;
+
+ spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
+ ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
+ bucket->bu_bhs, bucket->bu_blocks,
+ &bucket_xh(bucket)->xh_check);
+ spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
+
+ for (i = 0; i < bucket->bu_blocks; i++)
+ ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
+}
+
+static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
+ struct ocfs2_xattr_bucket *src)
+{
+ int i;
+ int blocksize = src->bu_inode->i_sb->s_blocksize;
+
+ BUG_ON(dest->bu_blocks != src->bu_blocks);
+ BUG_ON(dest->bu_inode != src->bu_inode);
+
+ for (i = 0; i < src->bu_blocks; i++) {
+ memcpy(bucket_block(dest, i), bucket_block(src, i),
+ blocksize);
+ }
+}
+
+static int ocfs2_validate_xattr_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)bh->b_data;
+
+ trace_ocfs2_validate_xattr_block((unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check);
+ if (rc)
+ return rc;
+
+ /*
+ * Errors after here are fatal
+ */
+
+ if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+ ocfs2_error(sb,
+ "Extended attribute block #%llu has bad "
+ "signature %.*s",
+ (unsigned long long)bh->b_blocknr, 7,
+ xb->xb_signature);
+ return -EINVAL;
+ }
+
+ if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
+ ocfs2_error(sb,
+ "Extended attribute block #%llu has an "
+ "invalid xb_blkno of %llu",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(xb->xb_blkno));
+ return -EINVAL;
+ }
+
+ if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+ ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid "
+ "xb_fs_generation of #%u",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(xb->xb_fs_generation));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
+ struct buffer_head **bh)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_block(INODE_CACHE(inode), xb_blkno, &tmp,
+ ocfs2_validate_xattr_block);
+
+ /* If ocfs2_read_block() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
+static inline const char *ocfs2_xattr_prefix(int name_index)
+{
+ const struct xattr_handler *handler = NULL;
+
+ if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
+ handler = ocfs2_xattr_handler_map[name_index];
+
+ return handler ? handler->prefix : NULL;
+}
+
+static u32 ocfs2_xattr_name_hash(struct inode *inode,
+ const char *name,
+ int name_len)
+{
+ /* Get hash value of uuid from super block */
+ u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash;
+ int i;
+
+ /* hash extended attribute name */
+ for (i = 0; i < name_len; i++) {
+ hash = (hash << OCFS2_HASH_SHIFT) ^
+ (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^
+ *name++;
+ }
+
+ return hash;
+}
+
+static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
+{
+ return namevalue_size(name_len, value_len) +
+ sizeof(struct ocfs2_xattr_entry);
+}
+
+static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi)
+{
+ return namevalue_size_xi(xi) +
+ sizeof(struct ocfs2_xattr_entry);
+}
+
+static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe)
+{
+ return namevalue_size_xe(xe) +
+ sizeof(struct ocfs2_xattr_entry);
+}
+
+int ocfs2_calc_security_init(struct inode *dir,
+ struct ocfs2_security_xattr_info *si,
+ int *want_clusters,
+ int *xattr_credits,
+ struct ocfs2_alloc_context **xattr_ac)
+{
+ int ret = 0;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ int s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+ si->value_len);
+
+ /*
+ * The max space of security xattr taken inline is
+ * 256(name) + 80(value) + 16(entry) = 352 bytes,
+ * So reserve one metadata block for it is ok.
+ */
+ if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+ s_size > OCFS2_XATTR_FREE_IN_IBODY) {
+ ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+ *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+ }
+
+ /* reserve clusters for xattr value which will be set in B tree*/
+ if (si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
+ si->value_len);
+
+ *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+ new_clusters);
+ *want_clusters += new_clusters;
+ }
+ return ret;
+}
+
+int ocfs2_calc_xattr_init(struct inode *dir,
+ struct buffer_head *dir_bh,
+ umode_t mode,
+ struct ocfs2_security_xattr_info *si,
+ int *want_clusters,
+ int *xattr_credits,
+ int *want_meta)
+{
+ int ret = 0;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ int s_size = 0, a_size = 0, acl_len = 0, new_clusters;
+
+ if (si->enable)
+ s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+ si->value_len);
+
+ if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+ acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
+ OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+ "", NULL, 0);
+ if (acl_len > 0) {
+ a_size = ocfs2_xattr_entry_real_size(0, acl_len);
+ if (S_ISDIR(mode))
+ a_size <<= 1;
+ } else if (acl_len != 0 && acl_len != -ENODATA) {
+ mlog_errno(ret);
+ return ret;
+ }
+ }
+
+ if (!(s_size + a_size))
+ return ret;
+
+ /*
+ * The max space of security xattr taken inline is
+ * 256(name) + 80(value) + 16(entry) = 352 bytes,
+ * The max space of acl xattr taken inline is
+ * 80(value) + 16(entry) * 2(if directory) = 192 bytes,
+ * when blocksize = 512, may reserve one more cluser for
+ * xattr bucket, otherwise reserve one metadata block
+ * for them is ok.
+ * If this is a new directory with inline data,
+ * we choose to reserve the entire inline area for
+ * directory contents and force an external xattr block.
+ */
+ if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+ (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
+ (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
+ *want_meta = *want_meta + 1;
+ *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+ }
+
+ if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE &&
+ (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) {
+ *want_clusters += 1;
+ *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb);
+ }
+
+ /*
+ * reserve credits and clusters for xattrs which has large value
+ * and have to be set outside
+ */
+ if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
+ si->value_len);
+ *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+ new_clusters);
+ *want_clusters += new_clusters;
+ }
+ if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL &&
+ acl_len > OCFS2_XATTR_INLINE_SIZE) {
+ /* for directory, it has DEFAULT and ACCESS two types of acls */
+ new_clusters = (S_ISDIR(mode) ? 2 : 1) *
+ ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
+ *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+ new_clusters);
+ *want_clusters += new_clusters;
+ }
+
+ return ret;
+}
+
+static int ocfs2_xattr_extend_allocation(struct inode *inode,
+ u32 clusters_to_add,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int status = 0, credits;
+ handle_t *handle = ctxt->handle;
+ enum ocfs2_alloc_restarted why;
+ u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
+
+ while (clusters_to_add) {
+ trace_ocfs2_xattr_extend_allocation(clusters_to_add);
+
+ status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ break;
+ }
+
+ prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+ status = ocfs2_add_clusters_in_btree(handle,
+ &et,
+ &logical_start,
+ clusters_to_add,
+ 0,
+ ctxt->data_ac,
+ ctxt->meta_ac,
+ &why);
+ if ((status < 0) && (status != -EAGAIN)) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ break;
+ }
+
+ ocfs2_journal_dirty(handle, vb->vb_bh);
+
+ clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) -
+ prev_clusters;
+
+ if (why != RESTART_NONE && clusters_to_add) {
+ /*
+ * We can only fail in case the alloc file doesn't give
+ * up enough clusters.
+ */
+ BUG_ON(why == RESTART_META);
+
+ credits = ocfs2_calc_extend_credits(inode->i_sb,
+ &vb->vb_xv->xr_list);
+ status = ocfs2_extend_trans(handle, credits);
+ if (status < 0) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ break;
+ }
+ }
+ }
+
+ return status;
+}
+
+static int __ocfs2_remove_xattr_range(struct inode *inode,
+ struct ocfs2_xattr_value_buf *vb,
+ u32 cpos, u32 phys_cpos, u32 len,
+ unsigned int ext_flags,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret;
+ u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+ handle_t *handle = ctxt->handle;
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
+
+ ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_remove_extent(handle, &et, cpos, len, ctxt->meta_ac,
+ &ctxt->dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
+ ocfs2_journal_dirty(handle, vb->vb_bh);
+
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ ret = ocfs2_decrease_refcount(inode, handle,
+ ocfs2_blocks_to_clusters(inode->i_sb,
+ phys_blkno),
+ len, ctxt->meta_ac, &ctxt->dealloc, 1);
+ else
+ ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc,
+ phys_blkno, len);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+static int ocfs2_xattr_shrink_size(struct inode *inode,
+ u32 old_clusters,
+ u32 new_clusters,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret = 0;
+ unsigned int ext_flags;
+ u32 trunc_len, cpos, phys_cpos, alloc_size;
+ u64 block;
+
+ if (old_clusters <= new_clusters)
+ return 0;
+
+ cpos = new_clusters;
+ trunc_len = old_clusters - new_clusters;
+ while (trunc_len) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
+ &alloc_size,
+ &vb->vb_xv->xr_list, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (alloc_size > trunc_len)
+ alloc_size = trunc_len;
+
+ ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
+ phys_cpos, alloc_size,
+ ext_flags, ctxt);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+ ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode),
+ block, alloc_size);
+ cpos += alloc_size;
+ trunc_len -= alloc_size;
+ }
+
+out:
+ return ret;
+}
+
+static int ocfs2_xattr_value_truncate(struct inode *inode,
+ struct ocfs2_xattr_value_buf *vb,
+ int len,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret;
+ u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
+ u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+
+ if (new_clusters == old_clusters)
+ return 0;
+
+ if (new_clusters > old_clusters)
+ ret = ocfs2_xattr_extend_allocation(inode,
+ new_clusters - old_clusters,
+ vb, ctxt);
+ else
+ ret = ocfs2_xattr_shrink_size(inode,
+ old_clusters, new_clusters,
+ vb, ctxt);
+
+ return ret;
+}
+
+static int ocfs2_xattr_list_entry(char *buffer, size_t size,
+ size_t *result, const char *prefix,
+ const char *name, int name_len)
+{
+ char *p = buffer + *result;
+ int prefix_len = strlen(prefix);
+ int total_len = prefix_len + name_len + 1;
+
+ *result += total_len;
+
+ /* we are just looking for how big our buffer needs to be */
+ if (!size)
+ return 0;
+
+ if (*result > size)
+ return -ERANGE;
+
+ memcpy(p, prefix, prefix_len);
+ memcpy(p + prefix_len, name, name_len);
+ p[prefix_len + name_len] = '\0';
+
+ return 0;
+}
+
+static int ocfs2_xattr_list_entries(struct inode *inode,
+ struct ocfs2_xattr_header *header,
+ char *buffer, size_t buffer_size)
+{
+ size_t result = 0;
+ int i, type, ret;
+ const char *prefix, *name;
+
+ for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
+ struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+ type = ocfs2_xattr_get_type(entry);
+ prefix = ocfs2_xattr_prefix(type);
+
+ if (prefix) {
+ name = (const char *)header +
+ le16_to_cpu(entry->xe_name_offset);
+
+ ret = ocfs2_xattr_list_entry(buffer, buffer_size,
+ &result, prefix, name,
+ entry->xe_name_len);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return result;
+}
+
+int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
+ struct ocfs2_dinode *di)
+{
+ struct ocfs2_xattr_header *xh;
+ int i;
+
+ xh = (struct ocfs2_xattr_header *)
+ ((void *)di + inode->i_sb->s_blocksize -
+ le16_to_cpu(di->i_xattr_inline_size));
+
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++)
+ if (!ocfs2_xattr_is_local(&xh->xh_entries[i]))
+ return 1;
+
+ return 0;
+}
+
+static int ocfs2_xattr_ibody_list(struct inode *inode,
+ struct ocfs2_dinode *di,
+ char *buffer,
+ size_t buffer_size)
+{
+ struct ocfs2_xattr_header *header = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ int ret = 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+ return ret;
+
+ header = (struct ocfs2_xattr_header *)
+ ((void *)di + inode->i_sb->s_blocksize -
+ le16_to_cpu(di->i_xattr_inline_size));
+
+ ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
+
+ return ret;
+}
+
+static int ocfs2_xattr_block_list(struct inode *inode,
+ struct ocfs2_dinode *di,
+ char *buffer,
+ size_t buffer_size)
+{
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_xattr_block *xb;
+ int ret = 0;
+
+ if (!di->i_xattr_loc)
+ return ret;
+
+ ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+ &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+ ret = ocfs2_xattr_list_entries(inode, header,
+ buffer, buffer_size);
+ } else
+ ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh,
+ buffer, buffer_size);
+
+ brelse(blk_bh);
+
+ return ret;
+}
+
+ssize_t ocfs2_listxattr(struct dentry *dentry,
+ char *buffer,
+ size_t size)
+{
+ int ret = 0, i_ret = 0, b_ret = 0;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode);
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb)))
+ return -EOPNOTSUPP;
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+ return ret;
+
+ ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ down_read(&oi->ip_xattr_sem);
+ i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size);
+ if (i_ret < 0)
+ b_ret = 0;
+ else {
+ if (buffer) {
+ buffer += i_ret;
+ size -= i_ret;
+ }
+ b_ret = ocfs2_xattr_block_list(dentry->d_inode, di,
+ buffer, size);
+ if (b_ret < 0)
+ i_ret = 0;
+ }
+ up_read(&oi->ip_xattr_sem);
+ ocfs2_inode_unlock(dentry->d_inode, 0);
+
+ brelse(di_bh);
+
+ return i_ret + b_ret;
+}
+
+static int ocfs2_xattr_find_entry(int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_xattr_entry *entry;
+ size_t name_len;
+ int i, cmp = 1;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ entry = xs->here;
+ for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+ cmp = name_index - ocfs2_xattr_get_type(entry);
+ if (!cmp)
+ cmp = name_len - entry->xe_name_len;
+ if (!cmp)
+ cmp = memcmp(name, (xs->base +
+ le16_to_cpu(entry->xe_name_offset)),
+ name_len);
+ if (cmp == 0)
+ break;
+ entry += 1;
+ }
+ xs->here = entry;
+
+ return cmp ? -ENODATA : 0;
+}
+
+static int ocfs2_xattr_get_value_outside(struct inode *inode,
+ struct ocfs2_xattr_value_root *xv,
+ void *buffer,
+ size_t len)
+{
+ u32 cpos, p_cluster, num_clusters, bpc, clusters;
+ u64 blkno;
+ int i, ret = 0;
+ size_t cplen, blocksize;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_extent_list *el;
+
+ el = &xv->xr_list;
+ clusters = le32_to_cpu(xv->xr_clusters);
+ bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ blocksize = inode->i_sb->s_blocksize;
+
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, el, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+ /* Copy ocfs2_xattr_value */
+ for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+ ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
+ &bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cplen = len >= blocksize ? blocksize : len;
+ memcpy(buffer, bh->b_data, cplen);
+ len -= cplen;
+ buffer += cplen;
+
+ brelse(bh);
+ bh = NULL;
+ if (len == 0)
+ break;
+ }
+ cpos += num_clusters;
+ }
+out:
+ return ret;
+}
+
+static int ocfs2_xattr_ibody_get(struct inode *inode,
+ int name_index,
+ const char *name,
+ void *buffer,
+ size_t buffer_size,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ struct ocfs2_xattr_value_root *xv;
+ size_t size;
+ int ret = 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+ return -ENODATA;
+
+ xs->end = (void *)di + inode->i_sb->s_blocksize;
+ xs->header = (struct ocfs2_xattr_header *)
+ (xs->end - le16_to_cpu(di->i_xattr_inline_size));
+ xs->base = (void *)xs->header;
+ xs->here = xs->header->xh_entries;
+
+ ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ if (ret)
+ return ret;
+ size = le64_to_cpu(xs->here->xe_value_size);
+ if (buffer) {
+ if (size > buffer_size)
+ return -ERANGE;
+ if (ocfs2_xattr_is_local(xs->here)) {
+ memcpy(buffer, (void *)xs->base +
+ le16_to_cpu(xs->here->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
+ } else {
+ xv = (struct ocfs2_xattr_value_root *)
+ (xs->base + le16_to_cpu(
+ xs->here->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xs->here->xe_name_len));
+ ret = ocfs2_xattr_get_value_outside(inode, xv,
+ buffer, size);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ }
+ }
+
+ return size;
+}
+
+static int ocfs2_xattr_block_get(struct inode *inode,
+ int name_index,
+ const char *name,
+ void *buffer,
+ size_t buffer_size,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_xattr_block *xb;
+ struct ocfs2_xattr_value_root *xv;
+ size_t size;
+ int ret = -ENODATA, name_offset, name_len, i;
+ int uninitialized_var(block_off);
+
+ xs->bucket = ocfs2_xattr_bucket_new(inode);
+ if (!xs->bucket) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto cleanup;
+ }
+
+ ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+
+ if (xs->not_found) {
+ ret = -ENODATA;
+ goto cleanup;
+ }
+
+ xb = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+ size = le64_to_cpu(xs->here->xe_value_size);
+ if (buffer) {
+ ret = -ERANGE;
+ if (size > buffer_size)
+ goto cleanup;
+
+ name_offset = le16_to_cpu(xs->here->xe_name_offset);
+ name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len);
+ i = xs->here - xs->header->xh_entries;
+
+ if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+ bucket_xh(xs->bucket),
+ i,
+ &block_off,
+ &name_offset);
+ xs->base = bucket_block(xs->bucket, block_off);
+ }
+ if (ocfs2_xattr_is_local(xs->here)) {
+ memcpy(buffer, (void *)xs->base +
+ name_offset + name_len, size);
+ } else {
+ xv = (struct ocfs2_xattr_value_root *)
+ (xs->base + name_offset + name_len);
+ ret = ocfs2_xattr_get_value_outside(inode, xv,
+ buffer, size);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+ }
+ }
+ ret = size;
+cleanup:
+ ocfs2_xattr_bucket_free(xs->bucket);
+
+ brelse(xs->xattr_bh);
+ xs->xattr_bh = NULL;
+ return ret;
+}
+
+int ocfs2_xattr_get_nolock(struct inode *inode,
+ struct buffer_head *di_bh,
+ int name_index,
+ const char *name,
+ void *buffer,
+ size_t buffer_size)
+{
+ int ret;
+ struct ocfs2_dinode *di = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_xattr_search xis = {
+ .not_found = -ENODATA,
+ };
+ struct ocfs2_xattr_search xbs = {
+ .not_found = -ENODATA,
+ };
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+ ret = -ENODATA;
+
+ xis.inode_bh = xbs.inode_bh = di_bh;
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
+ buffer_size, &xis);
+ if (ret == -ENODATA && di->i_xattr_loc)
+ ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
+ buffer_size, &xbs);
+
+ return ret;
+}
+
+/* ocfs2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer provided.
+ * Buffer is NULL to compute the size of buffer required.
+ */
+static int ocfs2_xattr_get(struct inode *inode,
+ int name_index,
+ const char *name,
+ void *buffer,
+ size_t buffer_size)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ down_read(&OCFS2_I(inode)->ip_xattr_sem);
+ ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+ name, buffer, buffer_size);
+ up_read(&OCFS2_I(inode)->ip_xattr_sem);
+
+ ocfs2_inode_unlock(inode, 0);
+
+ brelse(di_bh);
+
+ return ret;
+}
+
+static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+ handle_t *handle,
+ struct ocfs2_xattr_value_buf *vb,
+ const void *value,
+ int value_len)
+{
+ int ret = 0, i, cp_len;
+ u16 blocksize = inode->i_sb->s_blocksize;
+ u32 p_cluster, num_clusters;
+ u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
+ u64 blkno;
+ struct buffer_head *bh = NULL;
+ unsigned int ext_flags;
+ struct ocfs2_xattr_value_root *xv = vb->vb_xv;
+
+ BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
+
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, &xv->xr_list,
+ &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+
+ blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+
+ for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+ ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
+ &bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle,
+ INODE_CACHE(inode),
+ bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cp_len = value_len > blocksize ? blocksize : value_len;
+ memcpy(bh->b_data, value, cp_len);
+ value_len -= cp_len;
+ value += cp_len;
+ if (cp_len < blocksize)
+ memset(bh->b_data + cp_len, 0,
+ blocksize - cp_len);
+
+ ocfs2_journal_dirty(handle, bh);
+ brelse(bh);
+ bh = NULL;
+
+ /*
+ * XXX: do we need to empty all the following
+ * blocks in this cluster?
+ */
+ if (!value_len)
+ break;
+ }
+ cpos += num_clusters;
+ }
+out:
+ brelse(bh);
+
+ return ret;
+}
+
+static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
+ int num_entries)
+{
+ int free_space;
+
+ if (!needed_space)
+ return 0;
+
+ free_space = free_start -
+ sizeof(struct ocfs2_xattr_header) -
+ (num_entries * sizeof(struct ocfs2_xattr_entry)) -
+ OCFS2_XATTR_HEADER_GAP;
+ if (free_space < 0)
+ return -EIO;
+ if (free_space < needed_space)
+ return -ENOSPC;
+
+ return 0;
+}
+
+static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc,
+ int type)
+{
+ return loc->xl_ops->xlo_journal_access(handle, loc, type);
+}
+
+static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc)
+{
+ loc->xl_ops->xlo_journal_dirty(handle, loc);
+}
+
+/* Give a pointer into the storage for the given offset */
+static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset)
+{
+ BUG_ON(offset >= loc->xl_size);
+ return loc->xl_ops->xlo_offset_pointer(loc, offset);
+}
+
+/*
+ * Wipe the name+value pair and allow the storage to reclaim it. This
+ * must be followed by either removal of the entry or a call to
+ * ocfs2_xa_add_namevalue().
+ */
+static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+ loc->xl_ops->xlo_wipe_namevalue(loc);
+}
+
+/*
+ * Find lowest offset to a name+value pair. This is the start of our
+ * downward-growing free space.
+ */
+static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc)
+{
+ return loc->xl_ops->xlo_get_free_start(loc);
+}
+
+/* Can we reuse loc->xl_entry for xi? */
+static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ return loc->xl_ops->xlo_can_reuse(loc, xi);
+}
+
+/* How much free space is needed to set the new value */
+static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ return loc->xl_ops->xlo_check_space(loc, xi);
+}
+
+static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+ loc->xl_ops->xlo_add_entry(loc, name_hash);
+ loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
+ /*
+ * We can't leave the new entry's xe_name_offset at zero or
+ * add_namevalue() will go nuts. We set it to the size of our
+ * storage so that it can never be less than any other entry.
+ */
+ loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
+}
+
+static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ int size = namevalue_size_xi(xi);
+ int nameval_offset;
+ char *nameval_buf;
+
+ loc->xl_ops->xlo_add_namevalue(loc, size);
+ loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
+ loc->xl_entry->xe_name_len = xi->xi_name_len;
+ ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index);
+ ocfs2_xattr_set_local(loc->xl_entry,
+ xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE);
+
+ nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+ nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
+ memset(nameval_buf, 0, size);
+ memcpy(nameval_buf, xi->xi_name, xi->xi_name_len);
+}
+
+static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_value_buf *vb)
+{
+ int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+ int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
+
+ /* Value bufs are for value trees */
+ BUG_ON(ocfs2_xattr_is_local(loc->xl_entry));
+ BUG_ON(namevalue_size_xe(loc->xl_entry) !=
+ (name_size + OCFS2_XATTR_ROOT_SIZE));
+
+ loc->xl_ops->xlo_fill_value_buf(loc, vb);
+ vb->vb_xv =
+ (struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc,
+ nameval_offset +
+ name_size);
+}
+
+static int ocfs2_xa_block_journal_access(handle_t *handle,
+ struct ocfs2_xa_loc *loc, int type)
+{
+ struct buffer_head *bh = loc->xl_storage;
+ ocfs2_journal_access_func access;
+
+ if (loc->xl_size == (bh->b_size -
+ offsetof(struct ocfs2_xattr_block,
+ xb_attrs.xb_header)))
+ access = ocfs2_journal_access_xb;
+ else
+ access = ocfs2_journal_access_di;
+ return access(handle, INODE_CACHE(loc->xl_inode), bh, type);
+}
+
+static void ocfs2_xa_block_journal_dirty(handle_t *handle,
+ struct ocfs2_xa_loc *loc)
+{
+ struct buffer_head *bh = loc->xl_storage;
+
+ ocfs2_journal_dirty(handle, bh);
+}
+
+static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
+ int offset)
+{
+ return (char *)loc->xl_header + offset;
+}
+
+static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ /*
+ * Block storage is strict. If the sizes aren't exact, we will
+ * remove the old one and reinsert the new.
+ */
+ return namevalue_size_xe(loc->xl_entry) ==
+ namevalue_size_xi(xi);
+}
+
+static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc)
+{
+ struct ocfs2_xattr_header *xh = loc->xl_header;
+ int i, count = le16_to_cpu(xh->xh_count);
+ int offset, free_start = loc->xl_size;
+
+ for (i = 0; i < count; i++) {
+ offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
+ if (offset < free_start)
+ free_start = offset;
+ }
+
+ return free_start;
+}
+
+static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ int count = le16_to_cpu(loc->xl_header->xh_count);
+ int free_start = ocfs2_xa_get_free_start(loc);
+ int needed_space = ocfs2_xi_entry_usage(xi);
+
+ /*
+ * Block storage will reclaim the original entry before inserting
+ * the new value, so we only need the difference. If the new
+ * entry is smaller than the old one, we don't need anything.
+ */
+ if (loc->xl_entry) {
+ /* Don't need space if we're reusing! */
+ if (ocfs2_xa_can_reuse_entry(loc, xi))
+ needed_space = 0;
+ else
+ needed_space -= ocfs2_xe_entry_usage(loc->xl_entry);
+ }
+ if (needed_space < 0)
+ needed_space = 0;
+ return ocfs2_xa_check_space_helper(needed_space, free_start, count);
+}
+
+/*
+ * Block storage for xattrs keeps the name+value pairs compacted. When
+ * we remove one, we have to shift any that preceded it towards the end.
+ */
+static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+ int i, offset;
+ int namevalue_offset, first_namevalue_offset, namevalue_size;
+ struct ocfs2_xattr_entry *entry = loc->xl_entry;
+ struct ocfs2_xattr_header *xh = loc->xl_header;
+ int count = le16_to_cpu(xh->xh_count);
+
+ namevalue_offset = le16_to_cpu(entry->xe_name_offset);
+ namevalue_size = namevalue_size_xe(entry);
+ first_namevalue_offset = ocfs2_xa_get_free_start(loc);
+
+ /* Shift the name+value pairs */
+ memmove((char *)xh + first_namevalue_offset + namevalue_size,
+ (char *)xh + first_namevalue_offset,
+ namevalue_offset - first_namevalue_offset);
+ memset((char *)xh + first_namevalue_offset, 0, namevalue_size);
+
+ /* Now tell xh->xh_entries about it */
+ for (i = 0; i < count; i++) {
+ offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
+ if (offset <= namevalue_offset)
+ le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
+ namevalue_size);
+ }
+
+ /*
+ * Note that we don't update xh_free_start or xh_name_value_len
+ * because they're not used in block-stored xattrs.
+ */
+}
+
+static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+ int count = le16_to_cpu(loc->xl_header->xh_count);
+ loc->xl_entry = &(loc->xl_header->xh_entries[count]);
+ le16_add_cpu(&loc->xl_header->xh_count, 1);
+ memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
+}
+
+static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size)
+{
+ int free_start = ocfs2_xa_get_free_start(loc);
+
+ loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size);
+}
+
+static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_value_buf *vb)
+{
+ struct buffer_head *bh = loc->xl_storage;
+
+ if (loc->xl_size == (bh->b_size -
+ offsetof(struct ocfs2_xattr_block,
+ xb_attrs.xb_header)))
+ vb->vb_access = ocfs2_journal_access_xb;
+ else
+ vb->vb_access = ocfs2_journal_access_di;
+ vb->vb_bh = bh;
+}
+
+/*
+ * Operations for xattrs stored in blocks. This includes inline inode
+ * storage and unindexed ocfs2_xattr_blocks.
+ */
+static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
+ .xlo_journal_access = ocfs2_xa_block_journal_access,
+ .xlo_journal_dirty = ocfs2_xa_block_journal_dirty,
+ .xlo_offset_pointer = ocfs2_xa_block_offset_pointer,
+ .xlo_check_space = ocfs2_xa_block_check_space,
+ .xlo_can_reuse = ocfs2_xa_block_can_reuse,
+ .xlo_get_free_start = ocfs2_xa_block_get_free_start,
+ .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue,
+ .xlo_add_entry = ocfs2_xa_block_add_entry,
+ .xlo_add_namevalue = ocfs2_xa_block_add_namevalue,
+ .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf,
+};
+
+static int ocfs2_xa_bucket_journal_access(handle_t *handle,
+ struct ocfs2_xa_loc *loc, int type)
+{
+ struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+
+ return ocfs2_xattr_bucket_journal_access(handle, bucket, type);
+}
+
+static void ocfs2_xa_bucket_journal_dirty(handle_t *handle,
+ struct ocfs2_xa_loc *loc)
+{
+ struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+
+ ocfs2_xattr_bucket_journal_dirty(handle, bucket);
+}
+
+static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
+ int offset)
+{
+ struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+ int block, block_offset;
+
+ /* The header is at the front of the bucket */
+ block = offset >> loc->xl_inode->i_sb->s_blocksize_bits;
+ block_offset = offset % loc->xl_inode->i_sb->s_blocksize;
+
+ return bucket_block(bucket, block) + block_offset;
+}
+
+static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ return namevalue_size_xe(loc->xl_entry) >=
+ namevalue_size_xi(xi);
+}
+
+static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc)
+{
+ struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+ return le16_to_cpu(bucket_xh(bucket)->xh_free_start);
+}
+
+static int ocfs2_bucket_align_free_start(struct super_block *sb,
+ int free_start, int size)
+{
+ /*
+ * We need to make sure that the name+value pair fits within
+ * one block.
+ */
+ if (((free_start - size) >> sb->s_blocksize_bits) !=
+ ((free_start - 1) >> sb->s_blocksize_bits))
+ free_start -= free_start % sb->s_blocksize;
+
+ return free_start;
+}
+
+static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ int rc;
+ int count = le16_to_cpu(loc->xl_header->xh_count);
+ int free_start = ocfs2_xa_get_free_start(loc);
+ int needed_space = ocfs2_xi_entry_usage(xi);
+ int size = namevalue_size_xi(xi);
+ struct super_block *sb = loc->xl_inode->i_sb;
+
+ /*
+ * Bucket storage does not reclaim name+value pairs it cannot
+ * reuse. They live as holes until the bucket fills, and then
+ * the bucket is defragmented. However, the bucket can reclaim
+ * the ocfs2_xattr_entry.
+ */
+ if (loc->xl_entry) {
+ /* Don't need space if we're reusing! */
+ if (ocfs2_xa_can_reuse_entry(loc, xi))
+ needed_space = 0;
+ else
+ needed_space -= sizeof(struct ocfs2_xattr_entry);
+ }
+ BUG_ON(needed_space < 0);
+
+ if (free_start < size) {
+ if (needed_space)
+ return -ENOSPC;
+ } else {
+ /*
+ * First we check if it would fit in the first place.
+ * Below, we align the free start to a block. This may
+ * slide us below the minimum gap. By checking unaligned
+ * first, we avoid that error.
+ */
+ rc = ocfs2_xa_check_space_helper(needed_space, free_start,
+ count);
+ if (rc)
+ return rc;
+ free_start = ocfs2_bucket_align_free_start(sb, free_start,
+ size);
+ }
+ return ocfs2_xa_check_space_helper(needed_space, free_start, count);
+}
+
+static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+ le16_add_cpu(&loc->xl_header->xh_name_value_len,
+ -namevalue_size_xe(loc->xl_entry));
+}
+
+static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+ struct ocfs2_xattr_header *xh = loc->xl_header;
+ int count = le16_to_cpu(xh->xh_count);
+ int low = 0, high = count - 1, tmp;
+ struct ocfs2_xattr_entry *tmp_xe;
+
+ /*
+ * We keep buckets sorted by name_hash, so we need to find
+ * our insert place.
+ */
+ while (low <= high && count) {
+ tmp = (low + high) / 2;
+ tmp_xe = &xh->xh_entries[tmp];
+
+ if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
+ low = tmp + 1;
+ else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash))
+ high = tmp - 1;
+ else {
+ low = tmp;
+ break;
+ }
+ }
+
+ if (low != count)
+ memmove(&xh->xh_entries[low + 1],
+ &xh->xh_entries[low],
+ ((count - low) * sizeof(struct ocfs2_xattr_entry)));
+
+ le16_add_cpu(&xh->xh_count, 1);
+ loc->xl_entry = &xh->xh_entries[low];
+ memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
+}
+
+static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
+{
+ int free_start = ocfs2_xa_get_free_start(loc);
+ struct ocfs2_xattr_header *xh = loc->xl_header;
+ struct super_block *sb = loc->xl_inode->i_sb;
+ int nameval_offset;
+
+ free_start = ocfs2_bucket_align_free_start(sb, free_start, size);
+ nameval_offset = free_start - size;
+ loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset);
+ xh->xh_free_start = cpu_to_le16(nameval_offset);
+ le16_add_cpu(&xh->xh_name_value_len, size);
+
+}
+
+static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_value_buf *vb)
+{
+ struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+ struct super_block *sb = loc->xl_inode->i_sb;
+ int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+ int size = namevalue_size_xe(loc->xl_entry);
+ int block_offset = nameval_offset >> sb->s_blocksize_bits;
+
+ /* Values are not allowed to straddle block boundaries */
+ BUG_ON(block_offset !=
+ ((nameval_offset + size - 1) >> sb->s_blocksize_bits));
+ /* We expect the bucket to be filled in */
+ BUG_ON(!bucket->bu_bhs[block_offset]);
+
+ vb->vb_access = ocfs2_journal_access;
+ vb->vb_bh = bucket->bu_bhs[block_offset];
+}
+
+/* Operations for xattrs stored in buckets. */
+static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
+ .xlo_journal_access = ocfs2_xa_bucket_journal_access,
+ .xlo_journal_dirty = ocfs2_xa_bucket_journal_dirty,
+ .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer,
+ .xlo_check_space = ocfs2_xa_bucket_check_space,
+ .xlo_can_reuse = ocfs2_xa_bucket_can_reuse,
+ .xlo_get_free_start = ocfs2_xa_bucket_get_free_start,
+ .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue,
+ .xlo_add_entry = ocfs2_xa_bucket_add_entry,
+ .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue,
+ .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf,
+};
+
+static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc)
+{
+ struct ocfs2_xattr_value_buf vb;
+
+ if (ocfs2_xattr_is_local(loc->xl_entry))
+ return 0;
+
+ ocfs2_xa_fill_value_buf(loc, &vb);
+ return le32_to_cpu(vb.vb_xv->xr_clusters);
+}
+
+static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int trunc_rc, access_rc;
+ struct ocfs2_xattr_value_buf vb;
+
+ ocfs2_xa_fill_value_buf(loc, &vb);
+ trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes,
+ ctxt);
+
+ /*
+ * The caller of ocfs2_xa_value_truncate() has already called
+ * ocfs2_xa_journal_access on the loc. However, The truncate code
+ * calls ocfs2_extend_trans(). This may commit the previous
+ * transaction and open a new one. If this is a bucket, truncate
+ * could leave only vb->vb_bh set up for journaling. Meanwhile,
+ * the caller is expecting to dirty the entire bucket. So we must
+ * reset the journal work. We do this even if truncate has failed,
+ * as it could have failed after committing the extend.
+ */
+ access_rc = ocfs2_xa_journal_access(ctxt->handle, loc,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+
+ /* Errors in truncate take precedence */
+ return trunc_rc ? trunc_rc : access_rc;
+}
+
+static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
+{
+ int index, count;
+ struct ocfs2_xattr_header *xh = loc->xl_header;
+ struct ocfs2_xattr_entry *entry = loc->xl_entry;
+
+ ocfs2_xa_wipe_namevalue(loc);
+ loc->xl_entry = NULL;
+
+ le16_add_cpu(&xh->xh_count, -1);
+ count = le16_to_cpu(xh->xh_count);
+
+ /*
+ * Only zero out the entry if there are more remaining. This is
+ * important for an empty bucket, as it keeps track of the
+ * bucket's hash value. It doesn't hurt empty block storage.
+ */
+ if (count) {
+ index = ((char *)entry - (char *)&xh->xh_entries) /
+ sizeof(struct ocfs2_xattr_entry);
+ memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1],
+ (count - index) * sizeof(struct ocfs2_xattr_entry));
+ memset(&xh->xh_entries[count], 0,
+ sizeof(struct ocfs2_xattr_entry));
+ }
+}
+
+/*
+ * If we have a problem adjusting the size of an external value during
+ * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr
+ * in an intermediate state. For example, the value may be partially
+ * truncated.
+ *
+ * If the value tree hasn't changed, the extend/truncate went nowhere.
+ * We have nothing to do. The caller can treat it as a straight error.
+ *
+ * If the value tree got partially truncated, we now have a corrupted
+ * extended attribute. We're going to wipe its entry and leak the
+ * clusters. Better to leak some storage than leave a corrupt entry.
+ *
+ * If the value tree grew, it obviously didn't grow enough for the
+ * new entry. We're not going to try and reclaim those clusters either.
+ * If there was already an external value there (orig_clusters != 0),
+ * the new clusters are attached safely and we can just leave the old
+ * value in place. If there was no external value there, we remove
+ * the entry.
+ *
+ * This way, the xattr block we store in the journal will be consistent.
+ * If the size change broke because of the journal, no changes will hit
+ * disk anyway.
+ */
+static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc,
+ const char *what,
+ unsigned int orig_clusters)
+{
+ unsigned int new_clusters = ocfs2_xa_value_clusters(loc);
+ char *nameval_buf = ocfs2_xa_offset_pointer(loc,
+ le16_to_cpu(loc->xl_entry->xe_name_offset));
+
+ if (new_clusters < orig_clusters) {
+ mlog(ML_ERROR,
+ "Partial truncate while %s xattr %.*s. Leaking "
+ "%u clusters and removing the entry\n",
+ what, loc->xl_entry->xe_name_len, nameval_buf,
+ orig_clusters - new_clusters);
+ ocfs2_xa_remove_entry(loc);
+ } else if (!orig_clusters) {
+ mlog(ML_ERROR,
+ "Unable to allocate an external value for xattr "
+ "%.*s safely. Leaking %u clusters and removing the "
+ "entry\n",
+ loc->xl_entry->xe_name_len, nameval_buf,
+ new_clusters - orig_clusters);
+ ocfs2_xa_remove_entry(loc);
+ } else if (new_clusters > orig_clusters)
+ mlog(ML_ERROR,
+ "Unable to grow xattr %.*s safely. %u new clusters "
+ "have been added, but the value will not be "
+ "modified\n",
+ loc->xl_entry->xe_name_len, nameval_buf,
+ new_clusters - orig_clusters);
+}
+
+static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int rc = 0;
+ unsigned int orig_clusters;
+
+ if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+ orig_clusters = ocfs2_xa_value_clusters(loc);
+ rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+ if (rc) {
+ mlog_errno(rc);
+ /*
+ * Since this is remove, we can return 0 if
+ * ocfs2_xa_cleanup_value_truncate() is going to
+ * wipe the entry anyway. So we check the
+ * cluster count as well.
+ */
+ if (orig_clusters != ocfs2_xa_value_clusters(loc))
+ rc = 0;
+ ocfs2_xa_cleanup_value_truncate(loc, "removing",
+ orig_clusters);
+ if (rc)
+ goto out;
+ }
+ }
+
+ ocfs2_xa_remove_entry(loc);
+
+out:
+ return rc;
+}
+
+static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc)
+{
+ int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
+ char *nameval_buf;
+
+ nameval_buf = ocfs2_xa_offset_pointer(loc,
+ le16_to_cpu(loc->xl_entry->xe_name_offset));
+ memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE);
+}
+
+/*
+ * Take an existing entry and make it ready for the new value. This
+ * won't allocate space, but it may free space. It should be ready for
+ * ocfs2_xa_prepare_entry() to finish the work.
+ */
+static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int rc = 0;
+ int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
+ unsigned int orig_clusters;
+ char *nameval_buf;
+ int xe_local = ocfs2_xattr_is_local(loc->xl_entry);
+ int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE;
+
+ BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) !=
+ name_size);
+
+ nameval_buf = ocfs2_xa_offset_pointer(loc,
+ le16_to_cpu(loc->xl_entry->xe_name_offset));
+ if (xe_local) {
+ memset(nameval_buf + name_size, 0,
+ namevalue_size_xe(loc->xl_entry) - name_size);
+ if (!xi_local)
+ ocfs2_xa_install_value_root(loc);
+ } else {
+ orig_clusters = ocfs2_xa_value_clusters(loc);
+ if (xi_local) {
+ rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+ if (rc < 0)
+ mlog_errno(rc);
+ else
+ memset(nameval_buf + name_size, 0,
+ namevalue_size_xe(loc->xl_entry) -
+ name_size);
+ } else if (le64_to_cpu(loc->xl_entry->xe_value_size) >
+ xi->xi_value_len) {
+ rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len,
+ ctxt);
+ if (rc < 0)
+ mlog_errno(rc);
+ }
+
+ if (rc) {
+ ocfs2_xa_cleanup_value_truncate(loc, "reusing",
+ orig_clusters);
+ goto out;
+ }
+ }
+
+ loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
+ ocfs2_xattr_set_local(loc->xl_entry, xi_local);
+
+out:
+ return rc;
+}
+
+/*
+ * Prepares loc->xl_entry to receive the new xattr. This includes
+ * properly setting up the name+value pair region. If loc->xl_entry
+ * already exists, it will take care of modifying it appropriately.
+ *
+ * Note that this modifies the data. You did journal_access already,
+ * right?
+ */
+static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi,
+ u32 name_hash,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int rc = 0;
+ unsigned int orig_clusters;
+ __le64 orig_value_size = 0;
+
+ rc = ocfs2_xa_check_space(loc, xi);
+ if (rc)
+ goto out;
+
+ if (loc->xl_entry) {
+ if (ocfs2_xa_can_reuse_entry(loc, xi)) {
+ orig_value_size = loc->xl_entry->xe_value_size;
+ rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
+ if (rc)
+ goto out;
+ goto alloc_value;
+ }
+
+ if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+ orig_clusters = ocfs2_xa_value_clusters(loc);
+ rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+ if (rc) {
+ mlog_errno(rc);
+ ocfs2_xa_cleanup_value_truncate(loc,
+ "overwriting",
+ orig_clusters);
+ goto out;
+ }
+ }
+ ocfs2_xa_wipe_namevalue(loc);
+ } else
+ ocfs2_xa_add_entry(loc, name_hash);
+
+ /*
+ * If we get here, we have a blank entry. Fill it. We grow our
+ * name+value pair back from the end.
+ */
+ ocfs2_xa_add_namevalue(loc, xi);
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
+ ocfs2_xa_install_value_root(loc);
+
+alloc_value:
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+ orig_clusters = ocfs2_xa_value_clusters(loc);
+ rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
+ if (rc < 0) {
+ ctxt->set_abort = 1;
+ ocfs2_xa_cleanup_value_truncate(loc, "growing",
+ orig_clusters);
+ /*
+ * If we were growing an existing value,
+ * ocfs2_xa_cleanup_value_truncate() won't remove
+ * the entry. We need to restore the original value
+ * size.
+ */
+ if (loc->xl_entry) {
+ BUG_ON(!orig_value_size);
+ loc->xl_entry->xe_value_size = orig_value_size;
+ }
+ mlog_errno(rc);
+ }
+ }
+
+out:
+ return rc;
+}
+
+/*
+ * Store the value portion of the name+value pair. This will skip
+ * values that are stored externally. Their tree roots were set up
+ * by ocfs2_xa_prepare_entry().
+ */
+static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int rc = 0;
+ int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+ int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
+ char *nameval_buf;
+ struct ocfs2_xattr_value_buf vb;
+
+ nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+ ocfs2_xa_fill_value_buf(loc, &vb);
+ rc = __ocfs2_xattr_set_value_outside(loc->xl_inode,
+ ctxt->handle, &vb,
+ xi->xi_value,
+ xi->xi_value_len);
+ } else
+ memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len);
+
+ return rc;
+}
+
+static int ocfs2_xa_set(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret;
+ u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name,
+ xi->xi_name_len);
+
+ ret = ocfs2_xa_journal_access(ctxt->handle, loc,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * From here on out, everything is going to modify the buffer a
+ * little. Errors are going to leave the xattr header in a
+ * sane state. Thus, even with errors we dirty the sucker.
+ */
+
+ /* Don't worry, we are never called with !xi_value and !xl_entry */
+ if (!xi->xi_value) {
+ ret = ocfs2_xa_remove(loc, ctxt);
+ goto out_dirty;
+ }
+
+ ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
+ if (ret) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out_dirty;
+ }
+
+ ret = ocfs2_xa_store_value(loc, xi, ctxt);
+ if (ret)
+ mlog_errno(ret);
+
+out_dirty:
+ ocfs2_xa_journal_dirty(ctxt->handle, loc);
+
+out:
+ return ret;
+}
+
+static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
+ struct inode *inode,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_entry *entry)
+{
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL));
+
+ loc->xl_inode = inode;
+ loc->xl_ops = &ocfs2_xa_block_loc_ops;
+ loc->xl_storage = bh;
+ loc->xl_entry = entry;
+ loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
+ loc->xl_header =
+ (struct ocfs2_xattr_header *)(bh->b_data + bh->b_size -
+ loc->xl_size);
+}
+
+static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
+ struct inode *inode,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_entry *entry)
+{
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)bh->b_data;
+
+ BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED);
+
+ loc->xl_inode = inode;
+ loc->xl_ops = &ocfs2_xa_block_loc_ops;
+ loc->xl_storage = bh;
+ loc->xl_header = &(xb->xb_attrs.xb_header);
+ loc->xl_entry = entry;
+ loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block,
+ xb_attrs.xb_header);
+}
+
+static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_bucket *bucket,
+ struct ocfs2_xattr_entry *entry)
+{
+ loc->xl_inode = bucket->bu_inode;
+ loc->xl_ops = &ocfs2_xa_bucket_loc_ops;
+ loc->xl_storage = bucket;
+ loc->xl_header = bucket_xh(bucket);
+ loc->xl_entry = entry;
+ loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
+}
+
+/*
+ * In xattr remove, if it is stored outside and refcounted, we may have
+ * the chance to split the refcount tree. So need the allocators.
+ */
+static int ocfs2_lock_xattr_remove_allocators(struct inode *inode,
+ struct ocfs2_xattr_value_root *xv,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_alloc_context **meta_ac,
+ int *ref_credits)
+{
+ int ret, meta_add = 0;
+ u32 p_cluster, num_clusters;
+ unsigned int ext_flags;
+
+ *ref_credits = 0;
+ ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
+ &num_clusters,
+ &xv->xr_list,
+ &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+ goto out;
+
+ ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci,
+ ref_root_bh, xv,
+ &meta_add, ref_credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+ meta_add, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+static int ocfs2_remove_value_outside(struct inode*inode,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_xattr_header *header,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh)
+{
+ int ret = 0, i, ref_credits;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+ void *val;
+
+ ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
+
+ for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+ struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+
+ if (ocfs2_xattr_is_local(entry))
+ continue;
+
+ val = (void *)header +
+ le16_to_cpu(entry->xe_name_offset);
+ vb->vb_xv = (struct ocfs2_xattr_value_root *)
+ (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
+
+ ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv,
+ ref_ci, ref_root_bh,
+ &ctxt.meta_ac,
+ &ref_credits);
+
+ ctxt.handle = ocfs2_start_trans(osb, ref_credits +
+ ocfs2_remove_extent_credits(osb->sb));
+ if (IS_ERR(ctxt.handle)) {
+ ret = PTR_ERR(ctxt.handle);
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
+
+ ocfs2_commit_trans(osb, ctxt.handle);
+ if (ctxt.meta_ac) {
+ ocfs2_free_alloc_context(ctxt.meta_ac);
+ ctxt.meta_ac = NULL;
+ }
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
+ }
+
+ if (ctxt.meta_ac)
+ ocfs2_free_alloc_context(ctxt.meta_ac);
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &ctxt.dealloc);
+ return ret;
+}
+
+static int ocfs2_xattr_ibody_remove(struct inode *inode,
+ struct buffer_head *di_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh)
+{
+
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_xattr_header *header;
+ int ret;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = di_bh,
+ .vb_access = ocfs2_journal_access_di,
+ };
+
+ header = (struct ocfs2_xattr_header *)
+ ((void *)di + inode->i_sb->s_blocksize -
+ le16_to_cpu(di->i_xattr_inline_size));
+
+ ret = ocfs2_remove_value_outside(inode, &vb, header,
+ ref_ci, ref_root_bh);
+
+ return ret;
+}
+
+struct ocfs2_rm_xattr_bucket_para {
+ struct ocfs2_caching_info *ref_ci;
+ struct buffer_head *ref_root_bh;
+};
+
+static int ocfs2_xattr_block_remove(struct inode *inode,
+ struct buffer_head *blk_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh)
+{
+ struct ocfs2_xattr_block *xb;
+ int ret = 0;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = blk_bh,
+ .vb_access = ocfs2_journal_access_xb,
+ };
+ struct ocfs2_rm_xattr_bucket_para args = {
+ .ref_ci = ref_ci,
+ .ref_root_bh = ref_root_bh,
+ };
+
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
+ ret = ocfs2_remove_value_outside(inode, &vb, header,
+ ref_ci, ref_root_bh);
+ } else
+ ret = ocfs2_iterate_xattr_index_block(inode,
+ blk_bh,
+ ocfs2_rm_xattr_cluster,
+ &args);
+
+ return ret;
+}
+
+static int ocfs2_xattr_free_block(struct inode *inode,
+ u64 block,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh)
+{
+ struct inode *xb_alloc_inode;
+ struct buffer_head *xb_alloc_bh = NULL;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_xattr_block *xb;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ handle_t *handle;
+ int ret = 0;
+ u64 blk, bg_blkno;
+ u16 bit;
+
+ ret = ocfs2_read_xattr_block(inode, block, &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+ blk = le64_to_cpu(xb->xb_blkno);
+ bit = le16_to_cpu(xb->xb_suballoc_bit);
+ if (xb->xb_suballoc_loc)
+ bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
+ else
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+ xb_alloc_inode = ocfs2_get_system_file_inode(osb,
+ EXTENT_ALLOC_SYSTEM_INODE,
+ le16_to_cpu(xb->xb_suballoc_slot));
+ if (!xb_alloc_inode) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ mutex_lock(&xb_alloc_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh,
+ bit, bg_blkno, 1);
+ if (ret < 0)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, handle);
+out_unlock:
+ ocfs2_inode_unlock(xb_alloc_inode, 1);
+ brelse(xb_alloc_bh);
+out_mutex:
+ mutex_unlock(&xb_alloc_inode->i_mutex);
+ iput(xb_alloc_inode);
+out:
+ brelse(blk_bh);
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_remove()
+ *
+ * Free extended attribute resources associated with this inode.
+ */
+int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_caching_info *ref_ci = NULL;
+ handle_t *handle;
+ int ret;
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ return 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+ return 0;
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+ ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb),
+ le64_to_cpu(di->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ ref_ci = &ref_tree->rf_ci;
+
+ }
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+ ret = ocfs2_xattr_ibody_remove(inode, di_bh,
+ ref_ci, ref_root_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (di->i_xattr_loc) {
+ ret = ocfs2_xattr_free_block(inode,
+ le64_to_cpu(di->i_xattr_loc),
+ ref_ci, ref_root_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ di->i_xattr_loc = 0;
+
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ spin_unlock(&oi->ip_lock);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
+
+ ocfs2_journal_dirty(handle, di_bh);
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1);
+ brelse(ref_root_bh);
+ return ret;
+}
+
+static int ocfs2_xattr_has_space_inline(struct inode *inode,
+ struct ocfs2_dinode *di)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
+ int free;
+
+ if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE)
+ return 0;
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ struct ocfs2_inline_data *idata = &di->id2.i_data;
+ free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size);
+ } else if (ocfs2_inode_is_fast_symlink(inode)) {
+ free = ocfs2_fast_symlink_chars(inode->i_sb) -
+ le64_to_cpu(di->i_size);
+ } else {
+ struct ocfs2_extent_list *el = &di->id2.i_list;
+ free = (le16_to_cpu(el->l_count) -
+ le16_to_cpu(el->l_next_free_rec)) *
+ sizeof(struct ocfs2_extent_rec);
+ }
+ if (free >= xattrsize)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * ocfs2_xattr_ibody_find()
+ *
+ * Find extended attribute in inode block and
+ * fill search info into struct ocfs2_xattr_search.
+ */
+static int ocfs2_xattr_ibody_find(struct inode *inode,
+ int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ int ret;
+ int has_space = 0;
+
+ if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+ return 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+ down_read(&oi->ip_alloc_sem);
+ has_space = ocfs2_xattr_has_space_inline(inode, di);
+ up_read(&oi->ip_alloc_sem);
+ if (!has_space)
+ return 0;
+ }
+
+ xs->xattr_bh = xs->inode_bh;
+ xs->end = (void *)di + inode->i_sb->s_blocksize;
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)
+ xs->header = (struct ocfs2_xattr_header *)
+ (xs->end - le16_to_cpu(di->i_xattr_inline_size));
+ else
+ xs->header = (struct ocfs2_xattr_header *)
+ (xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size);
+ xs->base = (void *)xs->header;
+ xs->here = xs->header->xh_entries;
+
+ /* Find the named attribute. */
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+ ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ if (ret && ret != -ENODATA)
+ return ret;
+ xs->not_found = ret;
+ }
+
+ return 0;
+}
+
+static int ocfs2_xattr_ibody_init(struct inode *inode,
+ struct buffer_head *di_bh,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ unsigned int xattrsize = osb->s_xattr_inline_size;
+
+ if (!ocfs2_xattr_has_space_inline(inode, di)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Adjust extent record count or inline data size
+ * to reserve space for extended attribute.
+ */
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ struct ocfs2_inline_data *idata = &di->id2.i_data;
+ le16_add_cpu(&idata->id_count, -xattrsize);
+ } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
+ struct ocfs2_extent_list *el = &di->id2.i_list;
+ le16_add_cpu(&el->l_count, -(xattrsize /
+ sizeof(struct ocfs2_extent_rec)));
+ }
+ di->i_xattr_inline_size = cpu_to_le16(xattrsize);
+
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ spin_unlock(&oi->ip_lock);
+
+ ocfs2_journal_dirty(ctxt->handle, di_bh);
+
+out:
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_ibody_set()
+ *
+ * Set, replace or remove an extended attribute into inode block.
+ *
+ */
+static int ocfs2_xattr_ibody_set(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_xa_loc loc;
+
+ if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+ return -ENOSPC;
+
+ down_write(&oi->ip_alloc_sem);
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+ ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
+ if (ret) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
+ xs->not_found ? NULL : xs->here);
+ ret = ocfs2_xa_set(&loc, xi, ctxt);
+ if (ret) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+ xs->here = loc.xl_entry;
+
+out:
+ up_write(&oi->ip_alloc_sem);
+
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_block_find()
+ *
+ * Find extended attribute in external block and
+ * fill search info into struct ocfs2_xattr_search.
+ */
+static int ocfs2_xattr_block_find(struct inode *inode,
+ int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_xattr_block *xb;
+ int ret = 0;
+
+ if (!di->i_xattr_loc)
+ return ret;
+
+ ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+ &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ xs->xattr_bh = blk_bh;
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ xs->header = &xb->xb_attrs.xb_header;
+ xs->base = (void *)xs->header;
+ xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+ xs->here = xs->header->xh_entries;
+
+ ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ } else
+ ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+ name_index,
+ name, xs);
+
+ if (ret && ret != -ENODATA) {
+ xs->xattr_bh = NULL;
+ goto cleanup;
+ }
+ xs->not_found = ret;
+ return 0;
+cleanup:
+ brelse(blk_bh);
+
+ return ret;
+}
+
+static int ocfs2_create_xattr_block(struct inode *inode,
+ struct buffer_head *inode_bh,
+ struct ocfs2_xattr_set_ctxt *ctxt,
+ int indexed,
+ struct buffer_head **ret_bh)
+{
+ int ret;
+ u16 suballoc_bit_start;
+ u32 num_got;
+ u64 suballoc_loc, first_blkno;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_xattr_block *xblk;
+
+ ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
+ inode_bh, OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto end;
+ }
+
+ ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
+ &suballoc_loc, &suballoc_bit_start,
+ &num_got, &first_blkno);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto end;
+ }
+
+ new_bh = sb_getblk(inode->i_sb, first_blkno);
+ if (!new_bh) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto end;
+ }
+
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
+
+ ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
+ new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto end;
+ }
+
+ /* Initialize ocfs2_xattr_block */
+ xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
+ memset(xblk, 0, inode->i_sb->s_blocksize);
+ strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
+ xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
+ xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
+ xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ xblk->xb_fs_generation =
+ cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
+ xblk->xb_blkno = cpu_to_le64(first_blkno);
+ if (indexed) {
+ struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
+ xr->xt_clusters = cpu_to_le32(1);
+ xr->xt_last_eb_blk = 0;
+ xr->xt_list.l_tree_depth = 0;
+ xr->xt_list.l_count = cpu_to_le16(
+ ocfs2_xattr_recs_per_xb(inode->i_sb));
+ xr->xt_list.l_next_free_rec = cpu_to_le16(1);
+ xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
+ }
+ ocfs2_journal_dirty(ctxt->handle, new_bh);
+
+ /* Add it to the inode */
+ di->i_xattr_loc = cpu_to_le64(first_blkno);
+
+ spin_lock(&OCFS2_I(inode)->ip_lock);
+ OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
+ di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
+ spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+ ocfs2_journal_dirty(ctxt->handle, inode_bh);
+
+ *ret_bh = new_bh;
+ new_bh = NULL;
+
+end:
+ brelse(new_bh);
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_block_set()
+ *
+ * Set, replace or remove an extended attribute into external block.
+ *
+ */
+static int ocfs2_xattr_block_set(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_xattr_block *xblk = NULL;
+ int ret;
+ struct ocfs2_xa_loc loc;
+
+ if (!xs->xattr_bh) {
+ ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt,
+ 0, &new_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto end;
+ }
+
+ xs->xattr_bh = new_bh;
+ xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+ xs->header = &xblk->xb_attrs.xb_header;
+ xs->base = (void *)xs->header;
+ xs->end = (void *)xblk + inode->i_sb->s_blocksize;
+ xs->here = xs->header->xh_entries;
+ } else
+ xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+
+ if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
+ xs->not_found ? NULL : xs->here);
+
+ ret = ocfs2_xa_set(&loc, xi, ctxt);
+ if (!ret)
+ xs->here = loc.xl_entry;
+ else if ((ret != -ENOSPC) || ctxt->set_abort)
+ goto end;
+ else {
+ ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
+ if (ret)
+ goto end;
+ }
+ }
+
+ if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)
+ ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
+
+end:
+ return ret;
+}
+
+/* Check whether the new xattr can be inserted into the inode. */
+static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_xattr_entry *last;
+ int free, i;
+ size_t min_offs = xs->end - xs->base;
+
+ if (!xs->header)
+ return 0;
+
+ last = xs->header->xh_entries;
+
+ for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+ size_t offs = le16_to_cpu(last->xe_name_offset);
+ if (offs < min_offs)
+ min_offs = offs;
+ last += 1;
+ }
+
+ free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
+ if (free < 0)
+ return 0;
+
+ BUG_ON(!xs->not_found);
+
+ if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi)))
+ return 1;
+
+ return 0;
+}
+
+static int ocfs2_calc_xattr_set_need(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xis,
+ struct ocfs2_xattr_search *xbs,
+ int *clusters_need,
+ int *meta_need,
+ int *credits_need)
+{
+ int ret = 0, old_in_xb = 0;
+ int clusters_add = 0, meta_add = 0, credits = 0;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_xattr_block *xb = NULL;
+ struct ocfs2_xattr_entry *xe = NULL;
+ struct ocfs2_xattr_value_root *xv = NULL;
+ char *base = NULL;
+ int name_offset, name_len = 0;
+ u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
+ xi->xi_value_len);
+ u64 value_size;
+
+ /*
+ * Calculate the clusters we need to write.
+ * No matter whether we replace an old one or add a new one,
+ * we need this for writing.
+ */
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
+ credits += new_clusters *
+ ocfs2_clusters_to_blocks(inode->i_sb, 1);
+
+ if (xis->not_found && xbs->not_found) {
+ credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+ clusters_add += new_clusters;
+ credits += ocfs2_calc_extend_credits(inode->i_sb,
+ &def_xv.xv.xr_list);
+ }
+
+ goto meta_guess;
+ }
+
+ if (!xis->not_found) {
+ xe = xis->here;
+ name_offset = le16_to_cpu(xe->xe_name_offset);
+ name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ base = xis->base;
+ credits += OCFS2_INODE_UPDATE_CREDITS;
+ } else {
+ int i, block_off = 0;
+ xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+ xe = xbs->here;
+ name_offset = le16_to_cpu(xe->xe_name_offset);
+ name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ i = xbs->here - xbs->header->xh_entries;
+ old_in_xb = 1;
+
+ if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+ bucket_xh(xbs->bucket),
+ i, &block_off,
+ &name_offset);
+ base = bucket_block(xbs->bucket, block_off);
+ credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ } else {
+ base = xbs->base;
+ credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS;
+ }
+ }
+
+ /*
+ * delete a xattr doesn't need metadata and cluster allocation.
+ * so just calculate the credits and return.
+ *
+ * The credits for removing the value tree will be extended
+ * by ocfs2_remove_extent itself.
+ */
+ if (!xi->xi_value) {
+ if (!ocfs2_xattr_is_local(xe))
+ credits += ocfs2_remove_extent_credits(inode->i_sb);
+
+ goto out;
+ }
+
+ /* do cluster allocation guess first. */
+ value_size = le64_to_cpu(xe->xe_value_size);
+
+ if (old_in_xb) {
+ /*
+ * In xattr set, we always try to set the xe in inode first,
+ * so if it can be inserted into inode successfully, the old
+ * one will be removed from the xattr block, and this xattr
+ * will be inserted into inode as a new xattr in inode.
+ */
+ if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
+ clusters_add += new_clusters;
+ credits += ocfs2_remove_extent_credits(inode->i_sb) +
+ OCFS2_INODE_UPDATE_CREDITS;
+ if (!ocfs2_xattr_is_local(xe))
+ credits += ocfs2_calc_extend_credits(
+ inode->i_sb,
+ &def_xv.xv.xr_list);
+ goto out;
+ }
+ }
+
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+ /* the new values will be stored outside. */
+ u32 old_clusters = 0;
+
+ if (!ocfs2_xattr_is_local(xe)) {
+ old_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
+ value_size);
+ xv = (struct ocfs2_xattr_value_root *)
+ (base + name_offset + name_len);
+ value_size = OCFS2_XATTR_ROOT_SIZE;
+ } else
+ xv = &def_xv.xv;
+
+ if (old_clusters >= new_clusters) {
+ credits += ocfs2_remove_extent_credits(inode->i_sb);
+ goto out;
+ } else {
+ meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
+ clusters_add += new_clusters - old_clusters;
+ credits += ocfs2_calc_extend_credits(inode->i_sb,
+ &xv->xr_list);
+ if (value_size >= OCFS2_XATTR_ROOT_SIZE)
+ goto out;
+ }
+ } else {
+ /*
+ * Now the new value will be stored inside. So if the new
+ * value is smaller than the size of value root or the old
+ * value, we don't need any allocation, otherwise we have
+ * to guess metadata allocation.
+ */
+ if ((ocfs2_xattr_is_local(xe) &&
+ (value_size >= xi->xi_value_len)) ||
+ (!ocfs2_xattr_is_local(xe) &&
+ OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len))
+ goto out;
+ }
+
+meta_guess:
+ /* calculate metadata allocation. */
+ if (di->i_xattr_loc) {
+ if (!xbs->xattr_bh) {
+ ret = ocfs2_read_xattr_block(inode,
+ le64_to_cpu(di->i_xattr_loc),
+ &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xb = (struct ocfs2_xattr_block *)bh->b_data;
+ } else
+ xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+
+ /*
+ * If there is already an xattr tree, good, we can calculate
+ * like other b-trees. Otherwise we may have the chance of
+ * create a tree, the credit calculation is borrowed from
+ * ocfs2_calc_extend_credits with root_el = NULL. And the
+ * new tree will be cluster based, so no meta is needed.
+ */
+ if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+ struct ocfs2_extent_list *el =
+ &xb->xb_attrs.xb_root.xt_list;
+ meta_add += ocfs2_extend_meta_needed(el);
+ credits += ocfs2_calc_extend_credits(inode->i_sb,
+ el);
+ } else
+ credits += OCFS2_SUBALLOC_ALLOC + 1;
+
+ /*
+ * This cluster will be used either for new bucket or for
+ * new xattr block.
+ * If the cluster size is the same as the bucket size, one
+ * more is needed since we may need to extend the bucket
+ * also.
+ */
+ clusters_add += 1;
+ credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ if (OCFS2_XATTR_BUCKET_SIZE ==
+ OCFS2_SB(inode->i_sb)->s_clustersize) {
+ credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ clusters_add += 1;
+ }
+ } else {
+ credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+ struct ocfs2_extent_list *el = &def_xv.xv.xr_list;
+ meta_add += ocfs2_extend_meta_needed(el);
+ credits += ocfs2_calc_extend_credits(inode->i_sb,
+ el);
+ } else {
+ meta_add += 1;
+ }
+ }
+out:
+ if (clusters_need)
+ *clusters_need = clusters_add;
+ if (meta_need)
+ *meta_need = meta_add;
+ if (credits_need)
+ *credits_need = credits;
+ brelse(bh);
+ return ret;
+}
+
+static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xis,
+ struct ocfs2_xattr_search *xbs,
+ struct ocfs2_xattr_set_ctxt *ctxt,
+ int extra_meta,
+ int *credits)
+{
+ int clusters_add, meta_add, ret;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt));
+
+ ocfs2_init_dealloc_ctxt(&ctxt->dealloc);
+
+ ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
+ &clusters_add, &meta_add, credits);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ meta_add += extra_meta;
+ trace_ocfs2_init_xattr_set_ctxt(xi->xi_name, meta_add,
+ clusters_add, *credits);
+
+ if (meta_add) {
+ ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
+ &ctxt->meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (clusters_add) {
+ ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac);
+ if (ret)
+ mlog_errno(ret);
+ }
+out:
+ if (ret) {
+ if (ctxt->meta_ac) {
+ ocfs2_free_alloc_context(ctxt->meta_ac);
+ ctxt->meta_ac = NULL;
+ }
+
+ /*
+ * We cannot have an error and a non null ctxt->data_ac.
+ */
+ }
+
+ return ret;
+}
+
+static int __ocfs2_xattr_set_handle(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xis,
+ struct ocfs2_xattr_search *xbs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret = 0, credits, old_found;
+
+ if (!xi->xi_value) {
+ /* Remove existing extended attribute */
+ if (!xis->not_found)
+ ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+ else if (!xbs->not_found)
+ ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+ } else {
+ /* We always try to set extended attribute into inode first*/
+ ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+ if (!ret && !xbs->not_found) {
+ /*
+ * If succeed and that extended attribute existing in
+ * external block, then we will remove it.
+ */
+ xi->xi_value = NULL;
+ xi->xi_value_len = 0;
+
+ old_found = xis->not_found;
+ xis->not_found = -ENODATA;
+ ret = ocfs2_calc_xattr_set_need(inode,
+ di,
+ xi,
+ xis,
+ xbs,
+ NULL,
+ NULL,
+ &credits);
+ xis->not_found = old_found;
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_extend_trans(ctxt->handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+ } else if ((ret == -ENOSPC) && !ctxt->set_abort) {
+ if (di->i_xattr_loc && !xbs->xattr_bh) {
+ ret = ocfs2_xattr_block_find(inode,
+ xi->xi_name_index,
+ xi->xi_name, xbs);
+ if (ret)
+ goto out;
+
+ old_found = xis->not_found;
+ xis->not_found = -ENODATA;
+ ret = ocfs2_calc_xattr_set_need(inode,
+ di,
+ xi,
+ xis,
+ xbs,
+ NULL,
+ NULL,
+ &credits);
+ xis->not_found = old_found;
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_extend_trans(ctxt->handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ /*
+ * If no space in inode, we will set extended attribute
+ * into external block.
+ */
+ ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+ if (ret)
+ goto out;
+ if (!xis->not_found) {
+ /*
+ * If succeed and that extended attribute
+ * existing in inode, we will remove it.
+ */
+ xi->xi_value = NULL;
+ xi->xi_value_len = 0;
+ xbs->not_found = -ENODATA;
+ ret = ocfs2_calc_xattr_set_need(inode,
+ di,
+ xi,
+ xis,
+ xbs,
+ NULL,
+ NULL,
+ &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_extend_trans(ctxt->handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_xattr_ibody_set(inode, xi,
+ xis, ctxt);
+ }
+ }
+ }
+
+ if (!ret) {
+ /* Update inode ctime. */
+ ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
+ xis->inode_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ inode->i_ctime = CURRENT_TIME;
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
+ }
+out:
+ return ret;
+}
+
+/*
+ * This function only called duing creating inode
+ * for init security/acl xattrs of the new inode.
+ * All transanction credits have been reserved in mknod.
+ */
+int ocfs2_xattr_set_handle(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *di_bh,
+ int name_index,
+ const char *name,
+ const void *value,
+ size_t value_len,
+ int flags,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ struct ocfs2_dinode *di;
+ int ret;
+
+ struct ocfs2_xattr_info xi = {
+ .xi_name_index = name_index,
+ .xi_name = name,
+ .xi_name_len = strlen(name),
+ .xi_value = value,
+ .xi_value_len = value_len,
+ };
+
+ struct ocfs2_xattr_search xis = {
+ .not_found = -ENODATA,
+ };
+
+ struct ocfs2_xattr_search xbs = {
+ .not_found = -ENODATA,
+ };
+
+ struct ocfs2_xattr_set_ctxt ctxt = {
+ .handle = handle,
+ .meta_ac = meta_ac,
+ .data_ac = data_ac,
+ };
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ /*
+ * In extreme situation, may need xattr bucket when
+ * block size is too small. And we have already reserved
+ * the credits for bucket in mknod.
+ */
+ if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) {
+ xbs.bucket = ocfs2_xattr_bucket_new(inode);
+ if (!xbs.bucket) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
+ }
+
+ xis.inode_bh = xbs.inode_bh = di_bh;
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ down_write(&OCFS2_I(inode)->ip_xattr_sem);
+
+ ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+ if (ret)
+ goto cleanup;
+ if (xis.not_found) {
+ ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+ if (ret)
+ goto cleanup;
+ }
+
+ ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+
+cleanup:
+ up_write(&OCFS2_I(inode)->ip_xattr_sem);
+ brelse(xbs.xattr_bh);
+ ocfs2_xattr_bucket_free(xbs.bucket);
+
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_set()
+ *
+ * Set, replace or remove an extended attribute for this inode.
+ * value is NULL to remove an existing extended attribute, else either
+ * create or replace an extended attribute.
+ */
+int ocfs2_xattr_set(struct inode *inode,
+ int name_index,
+ const char *name,
+ const void *value,
+ size_t value_len,
+ int flags)
+{
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ int ret, credits, ref_meta = 0, ref_credits = 0;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, };
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+
+ struct ocfs2_xattr_info xi = {
+ .xi_name_index = name_index,
+ .xi_name = name,
+ .xi_name_len = strlen(name),
+ .xi_value = value,
+ .xi_value_len = value_len,
+ };
+
+ struct ocfs2_xattr_search xis = {
+ .not_found = -ENODATA,
+ };
+
+ struct ocfs2_xattr_search xbs = {
+ .not_found = -ENODATA,
+ };
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ /*
+ * Only xbs will be used on indexed trees. xis doesn't need a
+ * bucket.
+ */
+ xbs.bucket = ocfs2_xattr_bucket_new(inode);
+ if (!xbs.bucket) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto cleanup_nolock;
+ }
+ xis.inode_bh = xbs.inode_bh = di_bh;
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ down_write(&OCFS2_I(inode)->ip_xattr_sem);
+ /*
+ * Scan inode and external block to find the same name
+ * extended attribute and collect search information.
+ */
+ ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+ if (ret)
+ goto cleanup;
+ if (xis.not_found) {
+ ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+ if (ret)
+ goto cleanup;
+ }
+
+ if (xis.not_found && xbs.not_found) {
+ ret = -ENODATA;
+ if (flags & XATTR_REPLACE)
+ goto cleanup;
+ ret = 0;
+ if (!value)
+ goto cleanup;
+ } else {
+ ret = -EEXIST;
+ if (flags & XATTR_CREATE)
+ goto cleanup;
+ }
+
+ /* Check whether the value is refcounted and do some preparation. */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
+ (!xis.not_found || !xbs.not_found)) {
+ ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
+ &xis, &xbs, &ref_tree,
+ &ref_meta, &ref_credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+ }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mutex_unlock(&tl_inode->i_mutex);
+ mlog_errno(ret);
+ goto cleanup;
+ }
+ }
+ mutex_unlock(&tl_inode->i_mutex);
+
+ ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
+ &xbs, &ctxt, ref_meta, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+
+ /* we need to update inode's ctime field, so add credit for it. */
+ credits += OCFS2_INODE_UPDATE_CREDITS;
+ ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
+ if (IS_ERR(ctxt.handle)) {
+ ret = PTR_ERR(ctxt.handle);
+ mlog_errno(ret);
+ goto out_free_ac;
+ }
+
+ ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+ ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0);
+
+ ocfs2_commit_trans(osb, ctxt.handle);
+
+out_free_ac:
+ if (ctxt.data_ac)
+ ocfs2_free_alloc_context(ctxt.data_ac);
+ if (ctxt.meta_ac)
+ ocfs2_free_alloc_context(ctxt.meta_ac);
+ if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &ctxt.dealloc);
+
+cleanup:
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ up_write(&OCFS2_I(inode)->ip_xattr_sem);
+ if (!value && !ret) {
+ ret = ocfs2_try_remove_refcount_tree(inode, di_bh);
+ if (ret)
+ mlog_errno(ret);
+ }
+ ocfs2_inode_unlock(inode, 1);
+cleanup_nolock:
+ brelse(di_bh);
+ brelse(xbs.xattr_bh);
+ ocfs2_xattr_bucket_free(xbs.bucket);
+
+ return ret;
+}
+
+/*
+ * Find the xattr extent rec which may contains name_hash.
+ * e_cpos will be the first name hash of the xattr rec.
+ * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list.
+ */
+static int ocfs2_xattr_get_rec(struct inode *inode,
+ u32 name_hash,
+ u64 *p_blkno,
+ u32 *e_cpos,
+ u32 *num_clusters,
+ struct ocfs2_extent_list *el)
+{
+ int ret = 0, i;
+ struct buffer_head *eb_bh = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_rec *rec = NULL;
+ u64 e_blkno = 0;
+
+ if (el->l_tree_depth) {
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, name_hash,
+ &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in "
+ "xattr tree block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
+ }
+ }
+
+ for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+ rec = &el->l_recs[i];
+
+ if (le32_to_cpu(rec->e_cpos) <= name_hash) {
+ e_blkno = le64_to_cpu(rec->e_blkno);
+ break;
+ }
+ }
+
+ if (!e_blkno) {
+ ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+ "record (%u, %u, 0) in xattr", inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
+ ret = -EROFS;
+ goto out;
+ }
+
+ *p_blkno = le64_to_cpu(rec->e_blkno);
+ *num_clusters = le16_to_cpu(rec->e_leaf_clusters);
+ if (e_cpos)
+ *e_cpos = le32_to_cpu(rec->e_cpos);
+out:
+ brelse(eb_bh);
+ return ret;
+}
+
+typedef int (xattr_bucket_func)(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para);
+
+static int ocfs2_find_xe_in_bucket(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ int name_index,
+ const char *name,
+ u32 name_hash,
+ u16 *xe_index,
+ int *found)
+{
+ int i, ret = 0, cmp = 1, block_off, new_offset;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+ size_t name_len = strlen(name);
+ struct ocfs2_xattr_entry *xe = NULL;
+ char *xe_name;
+
+ /*
+ * We don't use binary search in the bucket because there
+ * may be multiple entries with the same name hash.
+ */
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+
+ if (name_hash > le32_to_cpu(xe->xe_name_hash))
+ continue;
+ else if (name_hash < le32_to_cpu(xe->xe_name_hash))
+ break;
+
+ cmp = name_index - ocfs2_xattr_get_type(xe);
+ if (!cmp)
+ cmp = name_len - xe->xe_name_len;
+ if (cmp)
+ continue;
+
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+ xh,
+ i,
+ &block_off,
+ &new_offset);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+
+ xe_name = bucket_block(bucket, block_off) + new_offset;
+ if (!memcmp(name, xe_name, name_len)) {
+ *xe_index = i;
+ *found = 1;
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Find the specified xattr entry in a series of buckets.
+ * This series start from p_blkno and last for num_clusters.
+ * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains
+ * the num of the valid buckets.
+ *
+ * Return the buffer_head this xattr should reside in. And if the xattr's
+ * hash is in the gap of 2 buckets, return the lower bucket.
+ */
+static int ocfs2_xattr_bucket_find(struct inode *inode,
+ int name_index,
+ const char *name,
+ u32 name_hash,
+ u64 p_blkno,
+ u32 first_hash,
+ u32 num_clusters,
+ struct ocfs2_xattr_search *xs)
+{
+ int ret, found = 0;
+ struct ocfs2_xattr_header *xh = NULL;
+ struct ocfs2_xattr_entry *xe = NULL;
+ u16 index = 0;
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ int low_bucket = 0, bucket, high_bucket;
+ struct ocfs2_xattr_bucket *search;
+ u32 last_hash;
+ u64 blkno, lower_blkno = 0;
+
+ search = ocfs2_xattr_bucket_new(inode);
+ if (!search) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_xattr_bucket(search, p_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xh = bucket_xh(search);
+ high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
+ while (low_bucket <= high_bucket) {
+ ocfs2_xattr_bucket_relse(search);
+
+ bucket = (low_bucket + high_bucket) / 2;
+ blkno = p_blkno + bucket * blk_per_bucket;
+ ret = ocfs2_read_xattr_bucket(search, blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xh = bucket_xh(search);
+ xe = &xh->xh_entries[0];
+ if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
+ high_bucket = bucket - 1;
+ continue;
+ }
+
+ /*
+ * Check whether the hash of the last entry in our
+ * bucket is larger than the search one. for an empty
+ * bucket, the last one is also the first one.
+ */
+ if (xh->xh_count)
+ xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
+
+ last_hash = le32_to_cpu(xe->xe_name_hash);
+
+ /* record lower_blkno which may be the insert place. */
+ lower_blkno = blkno;
+
+ if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
+ low_bucket = bucket + 1;
+ continue;
+ }
+
+ /* the searched xattr should reside in this bucket if exists. */
+ ret = ocfs2_find_xe_in_bucket(inode, search,
+ name_index, name, name_hash,
+ &index, &found);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ break;
+ }
+
+ /*
+ * Record the bucket we have found.
+ * When the xattr's hash value is in the gap of 2 buckets, we will
+ * always set it to the previous bucket.
+ */
+ if (!lower_blkno)
+ lower_blkno = p_blkno;
+
+ /* This should be in cache - we just read it during the search */
+ ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xs->header = bucket_xh(xs->bucket);
+ xs->base = bucket_block(xs->bucket, 0);
+ xs->end = xs->base + inode->i_sb->s_blocksize;
+
+ if (found) {
+ xs->here = &xs->header->xh_entries[index];
+ trace_ocfs2_xattr_bucket_find(OCFS2_I(inode)->ip_blkno,
+ name, name_index, name_hash,
+ (unsigned long long)bucket_blkno(xs->bucket),
+ index);
+ } else
+ ret = -ENODATA;
+
+out:
+ ocfs2_xattr_bucket_free(search);
+ return ret;
+}
+
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+ struct buffer_head *root_bh,
+ int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs)
+{
+ int ret;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)root_bh->b_data;
+ struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+ struct ocfs2_extent_list *el = &xb_root->xt_list;
+ u64 p_blkno = 0;
+ u32 first_hash, num_clusters = 0;
+ u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
+
+ if (le16_to_cpu(el->l_next_free_rec) == 0)
+ return -ENODATA;
+
+ trace_ocfs2_xattr_index_block_find(OCFS2_I(inode)->ip_blkno,
+ name, name_index, name_hash,
+ (unsigned long long)root_bh->b_blocknr,
+ -1);
+
+ ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
+ &num_clusters, el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
+
+ trace_ocfs2_xattr_index_block_find_rec(OCFS2_I(inode)->ip_blkno,
+ name, name_index, first_hash,
+ (unsigned long long)p_blkno,
+ num_clusters);
+
+ ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
+ p_blkno, first_hash, num_clusters, xs);
+
+out:
+ return ret;
+}
+
+static int ocfs2_iterate_xattr_buckets(struct inode *inode,
+ u64 blkno,
+ u32 clusters,
+ xattr_bucket_func *func,
+ void *para)
+{
+ int i, ret = 0;
+ u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+ u32 num_buckets = clusters * bpc;
+ struct ocfs2_xattr_bucket *bucket;
+
+ bucket = ocfs2_xattr_bucket_new(inode);
+ if (!bucket) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
+
+ trace_ocfs2_iterate_xattr_buckets(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)blkno, clusters);
+
+ for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
+ ret = ocfs2_read_xattr_bucket(bucket, blkno);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ /*
+ * The real bucket num in this series of blocks is stored
+ * in the 1st bucket.
+ */
+ if (i == 0)
+ num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
+
+ trace_ocfs2_iterate_xattr_bucket((unsigned long long)blkno,
+ le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
+ if (func) {
+ ret = func(inode, bucket, para);
+ if (ret && ret != -ERANGE)
+ mlog_errno(ret);
+ /* Fall through to bucket_relse() */
+ }
+
+ ocfs2_xattr_bucket_relse(bucket);
+ if (ret)
+ break;
+ }
+
+ ocfs2_xattr_bucket_free(bucket);
+ return ret;
+}
+
+struct ocfs2_xattr_tree_list {
+ char *buffer;
+ size_t buffer_size;
+ size_t result;
+};
+
+static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
+ struct ocfs2_xattr_header *xh,
+ int index,
+ int *block_off,
+ int *new_offset)
+{
+ u16 name_offset;
+
+ if (index < 0 || index >= le16_to_cpu(xh->xh_count))
+ return -EINVAL;
+
+ name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
+
+ *block_off = name_offset >> sb->s_blocksize_bits;
+ *new_offset = name_offset % sb->s_blocksize;
+
+ return 0;
+}
+
+static int ocfs2_list_xattr_bucket(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para)
+{
+ int ret = 0, type;
+ struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
+ int i, block_off, new_offset;
+ const char *prefix, *name;
+
+ for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
+ struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
+ type = ocfs2_xattr_get_type(entry);
+ prefix = ocfs2_xattr_prefix(type);
+
+ if (prefix) {
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+ bucket_xh(bucket),
+ i,
+ &block_off,
+ &new_offset);
+ if (ret)
+ break;
+
+ name = (const char *)bucket_block(bucket, block_off) +
+ new_offset;
+ ret = ocfs2_xattr_list_entry(xl->buffer,
+ xl->buffer_size,
+ &xl->result,
+ prefix, name,
+ entry->xe_name_len);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_iterate_xattr_index_block(struct inode *inode,
+ struct buffer_head *blk_bh,
+ xattr_tree_rec_func *rec_func,
+ void *para)
+{
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)blk_bh->b_data;
+ struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+ int ret = 0;
+ u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
+ u64 p_blkno = 0;
+
+ if (!el->l_next_free_rec || !rec_func)
+ return 0;
+
+ while (name_hash > 0) {
+ ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
+ &e_cpos, &num_clusters, el);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = rec_func(inode, blk_bh, p_blkno, e_cpos,
+ num_clusters, para);
+ if (ret) {
+ if (ret != -ERANGE)
+ mlog_errno(ret);
+ break;
+ }
+
+ if (e_cpos == 0)
+ break;
+
+ name_hash = e_cpos - 1;
+ }
+
+ return ret;
+
+}
+
+static int ocfs2_list_xattr_tree_rec(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno, u32 cpos, u32 len, void *para)
+{
+ return ocfs2_iterate_xattr_buckets(inode, blkno, len,
+ ocfs2_list_xattr_bucket, para);
+}
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+ struct buffer_head *blk_bh,
+ char *buffer,
+ size_t buffer_size)
+{
+ int ret;
+ struct ocfs2_xattr_tree_list xl = {
+ .buffer = buffer,
+ .buffer_size = buffer_size,
+ .result = 0,
+ };
+
+ ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
+ ocfs2_list_xattr_tree_rec, &xl);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = xl.result;
+out:
+ return ret;
+}
+
+static int cmp_xe(const void *a, const void *b)
+{
+ const struct ocfs2_xattr_entry *l = a, *r = b;
+ u32 l_hash = le32_to_cpu(l->xe_name_hash);
+ u32 r_hash = le32_to_cpu(r->xe_name_hash);
+
+ if (l_hash > r_hash)
+ return 1;
+ if (l_hash < r_hash)
+ return -1;
+ return 0;
+}
+
+static void swap_xe(void *a, void *b, int size)
+{
+ struct ocfs2_xattr_entry *l = a, *r = b, tmp;
+
+ tmp = *l;
+ memcpy(l, r, sizeof(struct ocfs2_xattr_entry));
+ memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry));
+}
+
+/*
+ * When the ocfs2_xattr_block is filled up, new bucket will be created
+ * and all the xattr entries will be moved to the new bucket.
+ * The header goes at the start of the bucket, and the names+values are
+ * filled from the end. This is why *target starts as the last buffer.
+ * Note: we need to sort the entries since they are not saved in order
+ * in the ocfs2_xattr_block.
+ */
+static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
+ struct buffer_head *xb_bh,
+ struct ocfs2_xattr_bucket *bucket)
+{
+ int i, blocksize = inode->i_sb->s_blocksize;
+ int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ u16 offset, size, off_change;
+ struct ocfs2_xattr_entry *xe;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)xb_bh->b_data;
+ struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+ u16 count = le16_to_cpu(xb_xh->xh_count);
+ char *src = xb_bh->b_data;
+ char *target = bucket_block(bucket, blks - 1);
+
+ trace_ocfs2_cp_xattr_block_to_bucket_begin(
+ (unsigned long long)xb_bh->b_blocknr,
+ (unsigned long long)bucket_blkno(bucket));
+
+ for (i = 0; i < blks; i++)
+ memset(bucket_block(bucket, i), 0, blocksize);
+
+ /*
+ * Since the xe_name_offset is based on ocfs2_xattr_header,
+ * there is a offset change corresponding to the change of
+ * ocfs2_xattr_header's position.
+ */
+ off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+ xe = &xb_xh->xh_entries[count - 1];
+ offset = le16_to_cpu(xe->xe_name_offset) + off_change;
+ size = blocksize - offset;
+
+ /* copy all the names and values. */
+ memcpy(target + offset, src + offset, size);
+
+ /* Init new header now. */
+ xh->xh_count = xb_xh->xh_count;
+ xh->xh_num_buckets = cpu_to_le16(1);
+ xh->xh_name_value_len = cpu_to_le16(size);
+ xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
+
+ /* copy all the entries. */
+ target = bucket_block(bucket, 0);
+ offset = offsetof(struct ocfs2_xattr_header, xh_entries);
+ size = count * sizeof(struct ocfs2_xattr_entry);
+ memcpy(target + offset, (char *)xb_xh + offset, size);
+
+ /* Change the xe offset for all the xe because of the move. */
+ off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize +
+ offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+ for (i = 0; i < count; i++)
+ le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
+
+ trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change);
+
+ sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
+ cmp_xe, swap_xe);
+}
+
+/*
+ * After we move xattr from block to index btree, we have to
+ * update ocfs2_xattr_search to the new xe and base.
+ *
+ * When the entry is in xattr block, xattr_bh indicates the storage place.
+ * While if the entry is in index b-tree, "bucket" indicates the
+ * real place of the xattr.
+ */
+static void ocfs2_xattr_update_xattr_search(struct inode *inode,
+ struct ocfs2_xattr_search *xs,
+ struct buffer_head *old_bh)
+{
+ char *buf = old_bh->b_data;
+ struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
+ struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
+ int i;
+
+ xs->header = bucket_xh(xs->bucket);
+ xs->base = bucket_block(xs->bucket, 0);
+ xs->end = xs->base + inode->i_sb->s_blocksize;
+
+ if (xs->not_found)
+ return;
+
+ i = xs->here - old_xh->xh_entries;
+ xs->here = &xs->header->xh_entries[i];
+}
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret;
+ u32 bit_off, len;
+ u64 blkno;
+ handle_t *handle = ctxt->handle;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct buffer_head *xb_bh = xs->xattr_bh;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)xb_bh->b_data;
+ struct ocfs2_xattr_tree_root *xr;
+ u16 xb_flags = le16_to_cpu(xb->xb_flags);
+
+ trace_ocfs2_xattr_create_index_block_begin(
+ (unsigned long long)xb_bh->b_blocknr);
+
+ BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
+ BUG_ON(!xs->bucket);
+
+ /*
+ * XXX:
+ * We can use this lock for now, and maybe move to a dedicated mutex
+ * if performance becomes a problem later.
+ */
+ down_write(&oi->ip_alloc_sem);
+
+ ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), xb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
+ 1, 1, &bit_off, &len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * The bucket may spread in many blocks, and
+ * we will only touch the 1st block and the last block
+ * in the whole bucket(one for entry and one for data).
+ */
+ blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
+
+ trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
+
+ ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket);
+ ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
+
+ ocfs2_xattr_update_xattr_search(inode, xs, xb_bh);
+
+ /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
+ memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
+ offsetof(struct ocfs2_xattr_block, xb_attrs));
+
+ xr = &xb->xb_attrs.xb_root;
+ xr->xt_clusters = cpu_to_le32(1);
+ xr->xt_last_eb_blk = 0;
+ xr->xt_list.l_tree_depth = 0;
+ xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb));
+ xr->xt_list.l_next_free_rec = cpu_to_le16(1);
+
+ xr->xt_list.l_recs[0].e_cpos = 0;
+ xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+ xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+
+ xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
+
+ ocfs2_journal_dirty(handle, xb_bh);
+
+out:
+ up_write(&oi->ip_alloc_sem);
+
+ return ret;
+}
+
+static int cmp_xe_offset(const void *a, const void *b)
+{
+ const struct ocfs2_xattr_entry *l = a, *r = b;
+ u32 l_name_offset = le16_to_cpu(l->xe_name_offset);
+ u32 r_name_offset = le16_to_cpu(r->xe_name_offset);
+
+ if (l_name_offset < r_name_offset)
+ return 1;
+ if (l_name_offset > r_name_offset)
+ return -1;
+ return 0;
+}
+
+/*
+ * defrag a xattr bucket if we find that the bucket has some
+ * holes beteen name/value pairs.
+ * We will move all the name/value pairs to the end of the bucket
+ * so that we can spare some space for insertion.
+ */
+static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+ handle_t *handle,
+ struct ocfs2_xattr_bucket *bucket)
+{
+ int ret, i;
+ size_t end, offset, len;
+ struct ocfs2_xattr_header *xh;
+ char *entries, *buf, *bucket_buf = NULL;
+ u64 blkno = bucket_blkno(bucket);
+ u16 xh_free_start;
+ size_t blocksize = inode->i_sb->s_blocksize;
+ struct ocfs2_xattr_entry *xe;
+
+ /*
+ * In order to make the operation more efficient and generic,
+ * we copy all the blocks into a contiguous memory and do the
+ * defragment there, so if anything is error, we will not touch
+ * the real block.
+ */
+ bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS);
+ if (!bucket_buf) {
+ ret = -EIO;
+ goto out;
+ }
+
+ buf = bucket_buf;
+ for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
+ memcpy(buf, bucket_block(bucket, i), blocksize);
+
+ ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xh = (struct ocfs2_xattr_header *)bucket_buf;
+ entries = (char *)xh->xh_entries;
+ xh_free_start = le16_to_cpu(xh->xh_free_start);
+
+ trace_ocfs2_defrag_xattr_bucket(
+ (unsigned long long)blkno, le16_to_cpu(xh->xh_count),
+ xh_free_start, le16_to_cpu(xh->xh_name_value_len));
+
+ /*
+ * sort all the entries by their offset.
+ * the largest will be the first, so that we can
+ * move them to the end one by one.
+ */
+ sort(entries, le16_to_cpu(xh->xh_count),
+ sizeof(struct ocfs2_xattr_entry),
+ cmp_xe_offset, swap_xe);
+
+ /* Move all name/values to the end of the bucket. */
+ xe = xh->xh_entries;
+ end = OCFS2_XATTR_BUCKET_SIZE;
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
+ offset = le16_to_cpu(xe->xe_name_offset);
+ len = namevalue_size_xe(xe);
+
+ /*
+ * We must make sure that the name/value pair
+ * exist in the same block. So adjust end to
+ * the previous block end if needed.
+ */
+ if (((end - len) / blocksize !=
+ (end - 1) / blocksize))
+ end = end - end % blocksize;
+
+ if (end > offset + len) {
+ memmove(bucket_buf + end - len,
+ bucket_buf + offset, len);
+ xe->xe_name_offset = cpu_to_le16(end - len);
+ }
+
+ mlog_bug_on_msg(end < offset + len, "Defrag check failed for "
+ "bucket %llu\n", (unsigned long long)blkno);
+
+ end -= len;
+ }
+
+ mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for "
+ "bucket %llu\n", (unsigned long long)blkno);
+
+ if (xh_free_start == end)
+ goto out;
+
+ memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
+ xh->xh_free_start = cpu_to_le16(end);
+
+ /* sort the entries by their name_hash. */
+ sort(entries, le16_to_cpu(xh->xh_count),
+ sizeof(struct ocfs2_xattr_entry),
+ cmp_xe, swap_xe);
+
+ buf = bucket_buf;
+ for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
+ memcpy(bucket_block(bucket, i), buf, blocksize);
+ ocfs2_xattr_bucket_journal_dirty(handle, bucket);
+
+out:
+ kfree(bucket_buf);
+ return ret;
+}
+
+/*
+ * prev_blkno points to the start of an existing extent. new_blkno
+ * points to a newly allocated extent. Because we know each of our
+ * clusters contains more than bucket, we can easily split one cluster
+ * at a bucket boundary. So we take the last cluster of the existing
+ * extent and split it down the middle. We move the last half of the
+ * buckets in the last cluster of the existing extent over to the new
+ * extent.
+ *
+ * first_bh is the buffer at prev_blkno so we can update the existing
+ * extent's bucket count. header_bh is the bucket were we were hoping
+ * to insert our xattr. If the bucket move places the target in the new
+ * extent, we'll update first_bh and header_bh after modifying the old
+ * extent.
+ *
+ * first_hash will be set as the 1st xe's name_hash in the new extent.
+ */
+static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
+ handle_t *handle,
+ struct ocfs2_xattr_bucket *first,
+ struct ocfs2_xattr_bucket *target,
+ u64 new_blkno,
+ u32 num_clusters,
+ u32 *first_hash)
+{
+ int ret;
+ struct super_block *sb = inode->i_sb;
+ int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb);
+ int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
+ int to_move = num_buckets / 2;
+ u64 src_blkno;
+ u64 last_cluster_blkno = bucket_blkno(first) +
+ ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1));
+
+ BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
+ BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
+
+ trace_ocfs2_mv_xattr_bucket_cross_cluster(
+ (unsigned long long)last_cluster_blkno,
+ (unsigned long long)new_blkno);
+
+ ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
+ last_cluster_blkno, new_blkno,
+ to_move, first_hash);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* This is the first bucket that got moved */
+ src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
+
+ /*
+ * If the target bucket was part of the moved buckets, we need to
+ * update first and target.
+ */
+ if (bucket_blkno(target) >= src_blkno) {
+ /* Find the block for the new target bucket */
+ src_blkno = new_blkno +
+ (bucket_blkno(target) - src_blkno);
+
+ ocfs2_xattr_bucket_relse(first);
+ ocfs2_xattr_bucket_relse(target);
+
+ /*
+ * These shouldn't fail - the buffers are in the
+ * journal from ocfs2_cp_xattr_bucket().
+ */
+ ret = ocfs2_read_xattr_bucket(first, new_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_read_xattr_bucket(target, src_blkno);
+ if (ret)
+ mlog_errno(ret);
+
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * Find the suitable pos when we divide a bucket into 2.
+ * We have to make sure the xattrs with the same hash value exist
+ * in the same bucket.
+ *
+ * If this ocfs2_xattr_header covers more than one hash value, find a
+ * place where the hash value changes. Try to find the most even split.
+ * The most common case is that all entries have different hash values,
+ * and the first check we make will find a place to split.
+ */
+static int ocfs2_xattr_find_divide_pos(struct ocfs2_xattr_header *xh)
+{
+ struct ocfs2_xattr_entry *entries = xh->xh_entries;
+ int count = le16_to_cpu(xh->xh_count);
+ int delta, middle = count / 2;
+
+ /*
+ * We start at the middle. Each step gets farther away in both
+ * directions. We therefore hit the change in hash value
+ * nearest to the middle. Note that this loop does not execute for
+ * count < 2.
+ */
+ for (delta = 0; delta < middle; delta++) {
+ /* Let's check delta earlier than middle */
+ if (cmp_xe(&entries[middle - delta - 1],
+ &entries[middle - delta]))
+ return middle - delta;
+
+ /* For even counts, don't walk off the end */
+ if ((middle + delta + 1) == count)
+ continue;
+
+ /* Now try delta past middle */
+ if (cmp_xe(&entries[middle + delta],
+ &entries[middle + delta + 1]))
+ return middle + delta + 1;
+ }
+
+ /* Every entry had the same hash */
+ return count;
+}
+
+/*
+ * Move some xattrs in old bucket(blk) to new bucket(new_blk).
+ * first_hash will record the 1st hash of the new bucket.
+ *
+ * Normally half of the xattrs will be moved. But we have to make
+ * sure that the xattrs with the same hash value are stored in the
+ * same bucket. If all the xattrs in this bucket have the same hash
+ * value, the new bucket will be initialized as an empty one and the
+ * first_hash will be initialized as (hash_value+1).
+ */
+static int ocfs2_divide_xattr_bucket(struct inode *inode,
+ handle_t *handle,
+ u64 blk,
+ u64 new_blk,
+ u32 *first_hash,
+ int new_bucket_head)
+{
+ int ret, i;
+ int count, start, len, name_value_len = 0, name_offset = 0;
+ struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
+ struct ocfs2_xattr_header *xh;
+ struct ocfs2_xattr_entry *xe;
+ int blocksize = inode->i_sb->s_blocksize;
+
+ trace_ocfs2_divide_xattr_bucket_begin((unsigned long long)blk,
+ (unsigned long long)new_blk);
+
+ s_bucket = ocfs2_xattr_bucket_new(inode);
+ t_bucket = ocfs2_xattr_bucket_new(inode);
+ if (!s_bucket || !t_bucket) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_xattr_bucket(s_bucket, blk);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Even if !new_bucket_head, we're overwriting t_bucket. Thus,
+ * there's no need to read it.
+ */
+ ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Hey, if we're overwriting t_bucket, what difference does
+ * ACCESS_CREATE vs ACCESS_WRITE make? See the comment in the
+ * same part of ocfs2_cp_xattr_bucket().
+ */
+ ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
+ new_bucket_head ?
+ OCFS2_JOURNAL_ACCESS_CREATE :
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xh = bucket_xh(s_bucket);
+ count = le16_to_cpu(xh->xh_count);
+ start = ocfs2_xattr_find_divide_pos(xh);
+
+ if (start == count) {
+ xe = &xh->xh_entries[start-1];
+
+ /*
+ * initialized a new empty bucket here.
+ * The hash value is set as one larger than
+ * that of the last entry in the previous bucket.
+ */
+ for (i = 0; i < t_bucket->bu_blocks; i++)
+ memset(bucket_block(t_bucket, i), 0, blocksize);
+
+ xh = bucket_xh(t_bucket);
+ xh->xh_free_start = cpu_to_le16(blocksize);
+ xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
+ le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
+
+ goto set_num_buckets;
+ }
+
+ /* copy the whole bucket to the new first. */
+ ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
+
+ /* update the new bucket. */
+ xh = bucket_xh(t_bucket);
+
+ /*
+ * Calculate the total name/value len and xh_free_start for
+ * the old bucket first.
+ */
+ name_offset = OCFS2_XATTR_BUCKET_SIZE;
+ name_value_len = 0;
+ for (i = 0; i < start; i++) {
+ xe = &xh->xh_entries[i];
+ name_value_len += namevalue_size_xe(xe);
+ if (le16_to_cpu(xe->xe_name_offset) < name_offset)
+ name_offset = le16_to_cpu(xe->xe_name_offset);
+ }
+
+ /*
+ * Now begin the modification to the new bucket.
+ *
+ * In the new bucket, We just move the xattr entry to the beginning
+ * and don't touch the name/value. So there will be some holes in the
+ * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is
+ * called.
+ */
+ xe = &xh->xh_entries[start];
+ len = sizeof(struct ocfs2_xattr_entry) * (count - start);
+ trace_ocfs2_divide_xattr_bucket_move(len,
+ (int)((char *)xe - (char *)xh),
+ (int)((char *)xh->xh_entries - (char *)xh));
+ memmove((char *)xh->xh_entries, (char *)xe, len);
+ xe = &xh->xh_entries[count - start];
+ len = sizeof(struct ocfs2_xattr_entry) * start;
+ memset((char *)xe, 0, len);
+
+ le16_add_cpu(&xh->xh_count, -start);
+ le16_add_cpu(&xh->xh_name_value_len, -name_value_len);
+
+ /* Calculate xh_free_start for the new bucket. */
+ xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+ if (le16_to_cpu(xe->xe_name_offset) <
+ le16_to_cpu(xh->xh_free_start))
+ xh->xh_free_start = xe->xe_name_offset;
+ }
+
+set_num_buckets:
+ /* set xh->xh_num_buckets for the new xh. */
+ if (new_bucket_head)
+ xh->xh_num_buckets = cpu_to_le16(1);
+ else
+ xh->xh_num_buckets = 0;
+
+ ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
+
+ /* store the first_hash of the new bucket. */
+ if (first_hash)
+ *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+
+ /*
+ * Now only update the 1st block of the old bucket. If we
+ * just added a new empty bucket, there is no need to modify
+ * it.
+ */
+ if (start == count)
+ goto out;
+
+ xh = bucket_xh(s_bucket);
+ memset(&xh->xh_entries[start], 0,
+ sizeof(struct ocfs2_xattr_entry) * (count - start));
+ xh->xh_count = cpu_to_le16(start);
+ xh->xh_free_start = cpu_to_le16(name_offset);
+ xh->xh_name_value_len = cpu_to_le16(name_value_len);
+
+ ocfs2_xattr_bucket_journal_dirty(handle, s_bucket);
+
+out:
+ ocfs2_xattr_bucket_free(s_bucket);
+ ocfs2_xattr_bucket_free(t_bucket);
+
+ return ret;
+}
+
+/*
+ * Copy xattr from one bucket to another bucket.
+ *
+ * The caller must make sure that the journal transaction
+ * has enough space for journaling.
+ */
+static int ocfs2_cp_xattr_bucket(struct inode *inode,
+ handle_t *handle,
+ u64 s_blkno,
+ u64 t_blkno,
+ int t_is_new)
+{
+ int ret;
+ struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
+
+ BUG_ON(s_blkno == t_blkno);
+
+ trace_ocfs2_cp_xattr_bucket((unsigned long long)s_blkno,
+ (unsigned long long)t_blkno,
+ t_is_new);
+
+ s_bucket = ocfs2_xattr_bucket_new(inode);
+ t_bucket = ocfs2_xattr_bucket_new(inode);
+ if (!s_bucket || !t_bucket) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno);
+ if (ret)
+ goto out;
+
+ /*
+ * Even if !t_is_new, we're overwriting t_bucket. Thus,
+ * there's no need to read it.
+ */
+ ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new);
+ if (ret)
+ goto out;
+
+ /*
+ * Hey, if we're overwriting t_bucket, what difference does
+ * ACCESS_CREATE vs ACCESS_WRITE make? Well, if we allocated a new
+ * cluster to fill, we came here from
+ * ocfs2_mv_xattr_buckets(), and it is really new -
+ * ACCESS_CREATE is required. But we also might have moved data
+ * out of t_bucket before extending back into it.
+ * ocfs2_add_new_xattr_bucket() can do this - its call to
+ * ocfs2_add_new_xattr_cluster() may have created a new extent
+ * and copied out the end of the old extent. Then it re-extends
+ * the old extent back to create space for new xattrs. That's
+ * how we get here, and the bucket isn't really new.
+ */
+ ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
+ t_is_new ?
+ OCFS2_JOURNAL_ACCESS_CREATE :
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret)
+ goto out;
+
+ ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
+ ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
+
+out:
+ ocfs2_xattr_bucket_free(t_bucket);
+ ocfs2_xattr_bucket_free(s_bucket);
+
+ return ret;
+}
+
+/*
+ * src_blk points to the start of an existing extent. last_blk points to
+ * last cluster in that extent. to_blk points to a newly allocated
+ * extent. We copy the buckets from the cluster at last_blk to the new
+ * extent. If start_bucket is non-zero, we skip that many buckets before
+ * we start copying. The new extent's xh_num_buckets gets set to the
+ * number of buckets we copied. The old extent's xh_num_buckets shrinks
+ * by the same amount.
+ */
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+ u64 src_blk, u64 last_blk, u64 to_blk,
+ unsigned int start_bucket,
+ u32 *first_hash)
+{
+ int i, ret, credits;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+ struct ocfs2_xattr_bucket *old_first, *new_first;
+
+ trace_ocfs2_mv_xattr_buckets((unsigned long long)last_blk,
+ (unsigned long long)to_blk);
+
+ BUG_ON(start_bucket >= num_buckets);
+ if (start_bucket) {
+ num_buckets -= start_bucket;
+ last_blk += (start_bucket * blks_per_bucket);
+ }
+
+ /* The first bucket of the original extent */
+ old_first = ocfs2_xattr_bucket_new(inode);
+ /* The first bucket of the new extent */
+ new_first = ocfs2_xattr_bucket_new(inode);
+ if (!old_first || !new_first) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_xattr_bucket(old_first, src_blk);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We need to update the first bucket of the old extent and all
+ * the buckets going to the new extent.
+ */
+ credits = ((num_buckets + 1) * blks_per_bucket);
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = 0; i < num_buckets; i++) {
+ ret = ocfs2_cp_xattr_bucket(inode, handle,
+ last_blk + (i * blks_per_bucket),
+ to_blk + (i * blks_per_bucket),
+ 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /*
+ * Get the new bucket ready before we dirty anything
+ * (This actually shouldn't fail, because we already dirtied
+ * it once in ocfs2_cp_xattr_bucket()).
+ */
+ ret = ocfs2_read_xattr_bucket(new_first, to_blk);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Now update the headers */
+ le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets);
+ ocfs2_xattr_bucket_journal_dirty(handle, old_first);
+
+ bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets);
+ ocfs2_xattr_bucket_journal_dirty(handle, new_first);
+
+ if (first_hash)
+ *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
+
+out:
+ ocfs2_xattr_bucket_free(new_first);
+ ocfs2_xattr_bucket_free(old_first);
+ return ret;
+}
+
+/*
+ * Move some xattrs in this cluster to the new cluster.
+ * This function should only be called when bucket size == cluster size.
+ * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
+ */
+static int ocfs2_divide_xattr_cluster(struct inode *inode,
+ handle_t *handle,
+ u64 prev_blk,
+ u64 new_blk,
+ u32 *first_hash)
+{
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ int ret, credits = 2 * blk_per_bucket;
+
+ BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
+
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* Move half of the xattr in start_blk to the next bucket. */
+ return ocfs2_divide_xattr_bucket(inode, handle, prev_blk,
+ new_blk, first_hash, 1);
+}
+
+/*
+ * Move some xattrs from the old cluster to the new one since they are not
+ * contiguous in ocfs2 xattr tree.
+ *
+ * new_blk starts a new separate cluster, and we will move some xattrs from
+ * prev_blk to it. v_start will be set as the first name hash value in this
+ * new cluster so that it can be used as e_cpos during tree insertion and
+ * don't collide with our original b-tree operations. first_bh and header_bh
+ * will also be updated since they will be used in ocfs2_extend_xattr_bucket
+ * to extend the insert bucket.
+ *
+ * The problem is how much xattr should we move to the new one and when should
+ * we update first_bh and header_bh?
+ * 1. If cluster size > bucket size, that means the previous cluster has more
+ * than 1 bucket, so just move half nums of bucket into the new cluster and
+ * update the first_bh and header_bh if the insert bucket has been moved
+ * to the new cluster.
+ * 2. If cluster_size == bucket_size:
+ * a) If the previous extent rec has more than one cluster and the insert
+ * place isn't in the last cluster, copy the entire last cluster to the
+ * new one. This time, we don't need to upate the first_bh and header_bh
+ * since they will not be moved into the new cluster.
+ * b) Otherwise, move the bottom half of the xattrs in the last cluster into
+ * the new one. And we set the extend flag to zero if the insert place is
+ * moved into the new allocated cluster since no extend is needed.
+ */
+static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
+ handle_t *handle,
+ struct ocfs2_xattr_bucket *first,
+ struct ocfs2_xattr_bucket *target,
+ u64 new_blk,
+ u32 prev_clusters,
+ u32 *v_start,
+ int *extend)
+{
+ int ret;
+
+ trace_ocfs2_adjust_xattr_cross_cluster(
+ (unsigned long long)bucket_blkno(first),
+ (unsigned long long)new_blk, prev_clusters);
+
+ if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
+ ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
+ handle,
+ first, target,
+ new_blk,
+ prev_clusters,
+ v_start);
+ if (ret)
+ mlog_errno(ret);
+ } else {
+ /* The start of the last cluster in the first extent */
+ u64 last_blk = bucket_blkno(first) +
+ ((prev_clusters - 1) *
+ ocfs2_clusters_to_blocks(inode->i_sb, 1));
+
+ if (prev_clusters > 1 && bucket_blkno(target) != last_blk) {
+ ret = ocfs2_mv_xattr_buckets(inode, handle,
+ bucket_blkno(first),
+ last_blk, new_blk, 0,
+ v_start);
+ if (ret)
+ mlog_errno(ret);
+ } else {
+ ret = ocfs2_divide_xattr_cluster(inode, handle,
+ last_blk, new_blk,
+ v_start);
+ if (ret)
+ mlog_errno(ret);
+
+ if ((bucket_blkno(target) == last_blk) && extend)
+ *extend = 0;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Add a new cluster for xattr storage.
+ *
+ * If the new cluster is contiguous with the previous one, it will be
+ * appended to the same extent record, and num_clusters will be updated.
+ * If not, we will insert a new extent for it and move some xattrs in
+ * the last cluster into the new allocated one.
+ * We also need to limit the maximum size of a btree leaf, otherwise we'll
+ * lose the benefits of hashing because we'll have to search large leaves.
+ * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize,
+ * if it's bigger).
+ *
+ * first_bh is the first block of the previous extent rec and header_bh
+ * indicates the bucket we will insert the new xattrs. They will be updated
+ * when the header_bh is moved into the new cluster.
+ */
+static int ocfs2_add_new_xattr_cluster(struct inode *inode,
+ struct buffer_head *root_bh,
+ struct ocfs2_xattr_bucket *first,
+ struct ocfs2_xattr_bucket *target,
+ u32 *num_clusters,
+ u32 prev_cpos,
+ int *extend,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret;
+ u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ u32 prev_clusters = *num_clusters;
+ u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
+ u64 block;
+ handle_t *handle = ctxt->handle;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_extent_tree et;
+
+ trace_ocfs2_add_new_xattr_cluster_begin(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)bucket_blkno(first),
+ prev_cpos, prev_clusters);
+
+ ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
+
+ ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
+ clusters_to_add, &bit_off, &num_bits);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ BUG_ON(num_bits > clusters_to_add);
+
+ block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ trace_ocfs2_add_new_xattr_cluster((unsigned long long)block, num_bits);
+
+ if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
+ (prev_clusters + num_bits) << osb->s_clustersize_bits <=
+ OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
+ /*
+ * If this cluster is contiguous with the old one and
+ * adding this new cluster, we don't surpass the limit of
+ * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be
+ * initialized and used like other buckets in the previous
+ * cluster.
+ * So add it as a contiguous one. The caller will handle
+ * its init process.
+ */
+ v_start = prev_cpos + prev_clusters;
+ *num_clusters = prev_clusters + num_bits;
+ } else {
+ ret = ocfs2_adjust_xattr_cross_cluster(inode,
+ handle,
+ first,
+ target,
+ block,
+ prev_clusters,
+ &v_start,
+ extend);
+ if (ret) {
+ mlog_errno(ret);
+ goto leave;
+ }
+ }
+
+ trace_ocfs2_add_new_xattr_cluster_insert((unsigned long long)block,
+ v_start, num_bits);
+ ret = ocfs2_insert_extent(handle, &et, v_start, block,
+ num_bits, 0, ctxt->meta_ac);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ ocfs2_journal_dirty(handle, root_bh);
+
+leave:
+ return ret;
+}
+
+/*
+ * We are given an extent. 'first' is the bucket at the very front of
+ * the extent. The extent has space for an additional bucket past
+ * bucket_xh(first)->xh_num_buckets. 'target_blkno' is the block number
+ * of the target bucket. We wish to shift every bucket past the target
+ * down one, filling in that additional space. When we get back to the
+ * target, we split the target between itself and the now-empty bucket
+ * at target+1 (aka, target_blkno + blks_per_bucket).
+ */
+static int ocfs2_extend_xattr_bucket(struct inode *inode,
+ handle_t *handle,
+ struct ocfs2_xattr_bucket *first,
+ u64 target_blk,
+ u32 num_clusters)
+{
+ int ret, credits;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ u64 end_blk;
+ u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
+
+ trace_ocfs2_extend_xattr_bucket((unsigned long long)target_blk,
+ (unsigned long long)bucket_blkno(first),
+ num_clusters, new_bucket);
+
+ /* The extent must have room for an additional bucket */
+ BUG_ON(new_bucket >=
+ (num_clusters * ocfs2_xattr_buckets_per_cluster(osb)));
+
+ /* end_blk points to the last existing bucket */
+ end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket);
+
+ /*
+ * end_blk is the start of the last existing bucket.
+ * Thus, (end_blk - target_blk) covers the target bucket and
+ * every bucket after it up to, but not including, the last
+ * existing bucket. Then we add the last existing bucket, the
+ * new bucket, and the first bucket (3 * blk_per_bucket).
+ */
+ credits = (end_blk - target_blk) + (3 * blk_per_bucket);
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_bucket_journal_access(handle, first,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ while (end_blk != target_blk) {
+ ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
+ end_blk + blk_per_bucket, 0);
+ if (ret)
+ goto out;
+ end_blk -= blk_per_bucket;
+ }
+
+ /* Move half of the xattr in target_blkno to the next bucket. */
+ ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk,
+ target_blk + blk_per_bucket, NULL, 0);
+
+ le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1);
+ ocfs2_xattr_bucket_journal_dirty(handle, first);
+
+out:
+ return ret;
+}
+
+/*
+ * Add new xattr bucket in an extent record and adjust the buckets
+ * accordingly. xb_bh is the ocfs2_xattr_block, and target is the
+ * bucket we want to insert into.
+ *
+ * In the easy case, we will move all the buckets after target down by
+ * one. Half of target's xattrs will be moved to the next bucket.
+ *
+ * If current cluster is full, we'll allocate a new one. This may not
+ * be contiguous. The underlying calls will make sure that there is
+ * space for the insert, shifting buckets around if necessary.
+ * 'target' may be moved by those calls.
+ */
+static int ocfs2_add_new_xattr_bucket(struct inode *inode,
+ struct buffer_head *xb_bh,
+ struct ocfs2_xattr_bucket *target,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)xb_bh->b_data;
+ struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+ struct ocfs2_extent_list *el = &xb_root->xt_list;
+ u32 name_hash =
+ le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int ret, num_buckets, extend = 1;
+ u64 p_blkno;
+ u32 e_cpos, num_clusters;
+ /* The bucket at the front of the extent */
+ struct ocfs2_xattr_bucket *first;
+
+ trace_ocfs2_add_new_xattr_bucket(
+ (unsigned long long)bucket_blkno(target));
+
+ /* The first bucket of the original extent */
+ first = ocfs2_xattr_bucket_new(inode);
+ if (!first) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
+ &num_clusters, el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_xattr_bucket(first, p_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
+ if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) {
+ /*
+ * This can move first+target if the target bucket moves
+ * to the new extent.
+ */
+ ret = ocfs2_add_new_xattr_cluster(inode,
+ xb_bh,
+ first,
+ target,
+ &num_clusters,
+ e_cpos,
+ &extend,
+ ctxt);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (extend) {
+ ret = ocfs2_extend_xattr_bucket(inode,
+ ctxt->handle,
+ first,
+ bucket_blkno(target),
+ num_clusters);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out:
+ ocfs2_xattr_bucket_free(first);
+
+ return ret;
+}
+
+static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ int offs)
+{
+ int block_off = offs >> inode->i_sb->s_blocksize_bits;
+
+ offs = offs % inode->i_sb->s_blocksize;
+ return bucket_block(bucket, block_off) + offs;
+}
+
+/*
+ * Truncate the specified xe_off entry in xattr bucket.
+ * bucket is indicated by header_bh and len is the new length.
+ * Both the ocfs2_xattr_value_root and the entry will be updated here.
+ *
+ * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
+ */
+static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ int xe_off,
+ int len,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret, offset;
+ u64 value_blk;
+ struct ocfs2_xattr_entry *xe;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+ size_t blocksize = inode->i_sb->s_blocksize;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_access = ocfs2_journal_access,
+ };
+
+ xe = &xh->xh_entries[xe_off];
+
+ BUG_ON(!xe || ocfs2_xattr_is_local(xe));
+
+ offset = le16_to_cpu(xe->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xe->xe_name_len);
+
+ value_blk = offset / blocksize;
+
+ /* We don't allow ocfs2_xattr_value to be stored in different block. */
+ BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
+
+ vb.vb_bh = bucket->bu_bhs[value_blk];
+ BUG_ON(!vb.vb_bh);
+
+ vb.vb_xv = (struct ocfs2_xattr_value_root *)
+ (vb.vb_bh->b_data + offset % blocksize);
+
+ /*
+ * From here on out we have to dirty the bucket. The generic
+ * value calls only modify one of the bucket's bhs, but we need
+ * to send the bucket at once. So if they error, they *could* have
+ * modified something. We have to assume they did, and dirty
+ * the whole bucket. This leaves us in a consistent state.
+ */
+ trace_ocfs2_xattr_bucket_value_truncate(
+ (unsigned long long)bucket_blkno(bucket), xe_off, len);
+ ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xe->xe_value_size = cpu_to_le64(len);
+
+ ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
+
+out:
+ return ret;
+}
+
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno,
+ u32 cpos,
+ u32 len,
+ void *para)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ handle_t *handle;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)root_bh->b_data;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_extent_tree et;
+
+ ret = ocfs2_iterate_xattr_buckets(inode, blkno, len,
+ ocfs2_delete_xattr_in_bucket, para);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ trace_ocfs2_rm_xattr_cluster(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)blkno, cpos, len);
+
+ ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno,
+ len);
+
+ ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+ if (IS_ERR(handle)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_remove_extent(handle, &et, cpos, len, meta_ac,
+ &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
+ ocfs2_journal_dirty(handle, root_bh);
+
+ ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
+ if (ret)
+ mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+
+ mutex_unlock(&tl_inode->i_mutex);
+
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ ocfs2_run_deallocs(osb, &dealloc);
+
+ return ret;
+}
+
+/*
+ * check whether the xattr bucket is filled up with the same hash value.
+ * If we want to insert the xattr with the same hash, return -ENOSPC.
+ * If we want to insert a xattr with different hash value, go ahead
+ * and ocfs2_divide_xattr_bucket will handle this.
+ */
+static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ const char *name)
+{
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+ u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
+
+ if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
+ return 0;
+
+ if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
+ xh->xh_entries[0].xe_name_hash) {
+ mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
+ "hash = %u\n",
+ (unsigned long long)bucket_blkno(bucket),
+ le32_to_cpu(xh->xh_entries[0].xe_name_hash));
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+/*
+ * Try to set the entry in the current bucket. If we fail, the caller
+ * will handle getting us another bucket.
+ */
+static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret;
+ struct ocfs2_xa_loc loc;
+
+ trace_ocfs2_xattr_set_entry_bucket(xi->xi_name);
+
+ ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
+ xs->not_found ? NULL : xs->here);
+ ret = ocfs2_xa_set(&loc, xi, ctxt);
+ if (!ret) {
+ xs->here = loc.xl_entry;
+ goto out;
+ }
+ if (ret != -ENOSPC) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Ok, we need space. Let's try defragmenting the bucket. */
+ ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
+ xs->bucket);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xa_set(&loc, xi, ctxt);
+ if (!ret) {
+ xs->here = loc.xl_entry;
+ goto out;
+ }
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+
+
+out:
+ return ret;
+}
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret;
+
+ trace_ocfs2_xattr_set_entry_index_block(xi->xi_name);
+
+ ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
+ if (!ret)
+ goto out;
+ if (ret != -ENOSPC) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Ack, need more space. Let's try to get another bucket! */
+
+ /*
+ * We do not allow for overlapping ranges between buckets. And
+ * the maximum number of collisions we will allow for then is
+ * one bucket's worth, so check it here whether we need to
+ * add a new bucket for the insert.
+ */
+ ret = ocfs2_check_xattr_bucket_collision(inode,
+ xs->bucket,
+ xi->xi_name);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_add_new_xattr_bucket(inode,
+ xs->xattr_bh,
+ xs->bucket,
+ ctxt);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * ocfs2_add_new_xattr_bucket() will have updated
+ * xs->bucket if it moved, but it will not have updated
+ * any of the other search fields. Thus, we drop it and
+ * re-search. Everything should be cached, so it'll be
+ * quick.
+ */
+ ocfs2_xattr_bucket_relse(xs->bucket);
+ ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
+ xi->xi_name_index,
+ xi->xi_name, xs);
+ if (ret && ret != -ENODATA)
+ goto out;
+ xs->not_found = ret;
+
+ /* Ok, we have a new bucket, let's try again */
+ ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
+ if (ret && (ret != -ENOSPC))
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para)
+{
+ int ret = 0, ref_credits;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+ u16 i;
+ struct ocfs2_xattr_entry *xe;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
+ int credits = ocfs2_remove_extent_credits(osb->sb) +
+ ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ struct ocfs2_xattr_value_root *xv;
+ struct ocfs2_rm_xattr_bucket_para *args =
+ (struct ocfs2_rm_xattr_bucket_para *)para;
+
+ ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
+
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
+ i, &xv, NULL);
+
+ ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
+ args->ref_ci,
+ args->ref_root_bh,
+ &ctxt.meta_ac,
+ &ref_credits);
+
+ ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
+ if (IS_ERR(ctxt.handle)) {
+ ret = PTR_ERR(ctxt.handle);
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
+ i, 0, &ctxt);
+
+ ocfs2_commit_trans(osb, ctxt.handle);
+ if (ctxt.meta_ac) {
+ ocfs2_free_alloc_context(ctxt.meta_ac);
+ ctxt.meta_ac = NULL;
+ }
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ if (ctxt.meta_ac)
+ ocfs2_free_alloc_context(ctxt.meta_ac);
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &ctxt.dealloc);
+ return ret;
+}
+
+/*
+ * Whenever we modify a xattr value root in the bucket(e.g, CoW
+ * or change the extent record flag), we need to recalculate
+ * the metaecc for the whole bucket. So it is done here.
+ *
+ * Note:
+ * We have to give the extra credits for the caller.
+ */
+static int ocfs2_xattr_bucket_post_refcount(struct inode *inode,
+ handle_t *handle,
+ void *para)
+{
+ int ret;
+ struct ocfs2_xattr_bucket *bucket =
+ (struct ocfs2_xattr_bucket *)para;
+
+ ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ocfs2_xattr_bucket_journal_dirty(handle, bucket);
+
+ return 0;
+}
+
+/*
+ * Special action we need if the xattr value is refcounted.
+ *
+ * 1. If the xattr is refcounted, lock the tree.
+ * 2. CoW the xattr if we are setting the new value and the value
+ * will be stored outside.
+ * 3. In other case, decrease_refcount will work for us, so just
+ * lock the refcount tree, calculate the meta and credits is OK.
+ *
+ * We have to do CoW before ocfs2_init_xattr_set_ctxt since
+ * currently CoW is a completed transaction, while this function
+ * will also lock the allocators and let us deadlock. So we will
+ * CoW the whole xattr value.
+ */
+static int ocfs2_prepare_refcount_xattr(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xis,
+ struct ocfs2_xattr_search *xbs,
+ struct ocfs2_refcount_tree **ref_tree,
+ int *meta_add,
+ int *credits)
+{
+ int ret = 0;
+ struct ocfs2_xattr_block *xb;
+ struct ocfs2_xattr_entry *xe;
+ char *base;
+ u32 p_cluster, num_clusters;
+ unsigned int ext_flags;
+ int name_offset, name_len;
+ struct ocfs2_xattr_value_buf vb;
+ struct ocfs2_xattr_bucket *bucket = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_post_refcount refcount;
+ struct ocfs2_post_refcount *p = NULL;
+ struct buffer_head *ref_root_bh = NULL;
+
+ if (!xis->not_found) {
+ xe = xis->here;
+ name_offset = le16_to_cpu(xe->xe_name_offset);
+ name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ base = xis->base;
+ vb.vb_bh = xis->inode_bh;
+ vb.vb_access = ocfs2_journal_access_di;
+ } else {
+ int i, block_off = 0;
+ xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+ xe = xbs->here;
+ name_offset = le16_to_cpu(xe->xe_name_offset);
+ name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ i = xbs->here - xbs->header->xh_entries;
+
+ if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+ bucket_xh(xbs->bucket),
+ i, &block_off,
+ &name_offset);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ base = bucket_block(xbs->bucket, block_off);
+ vb.vb_bh = xbs->bucket->bu_bhs[block_off];
+ vb.vb_access = ocfs2_journal_access;
+
+ if (ocfs2_meta_ecc(osb)) {
+ /*create parameters for ocfs2_post_refcount. */
+ bucket = xbs->bucket;
+ refcount.credits = bucket->bu_blocks;
+ refcount.para = bucket;
+ refcount.func =
+ ocfs2_xattr_bucket_post_refcount;
+ p = &refcount;
+ }
+ } else {
+ base = xbs->base;
+ vb.vb_bh = xbs->xattr_bh;
+ vb.vb_access = ocfs2_journal_access_xb;
+ }
+ }
+
+ if (ocfs2_xattr_is_local(xe))
+ goto out;
+
+ vb.vb_xv = (struct ocfs2_xattr_value_root *)
+ (base + name_offset + name_len);
+
+ ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
+ &num_clusters, &vb.vb_xv->xr_list,
+ &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We just need to check the 1st extent record, since we always
+ * CoW the whole xattr. So there shouldn't be a xattr with
+ * some REFCOUNT extent recs after the 1st one.
+ */
+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+ goto out;
+
+ ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+ 1, ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * If we are deleting the xattr or the new size will be stored inside,
+ * cool, leave it there, the xattr truncate process will remove them
+ * for us(it still needs the refcount tree lock and the meta, credits).
+ * And the worse case is that every cluster truncate will split the
+ * refcount tree, and make the original extent become 3. So we will need
+ * 2 * cluster more extent recs at most.
+ */
+ if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) {
+
+ ret = ocfs2_refcounted_xattr_delete_need(inode,
+ &(*ref_tree)->rf_ci,
+ ref_root_bh, vb.vb_xv,
+ meta_add, credits);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_refcount_cow_xattr(inode, di, &vb,
+ *ref_tree, ref_root_bh, 0,
+ le32_to_cpu(vb.vb_xv->xr_clusters), p);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ brelse(ref_root_bh);
+ return ret;
+}
+
+/*
+ * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root.
+ * The physical clusters will be added to refcount tree.
+ */
+static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
+ struct ocfs2_xattr_value_root *xv,
+ struct ocfs2_extent_tree *value_et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_post_refcount *refcount)
+{
+ int ret = 0;
+ u32 clusters = le32_to_cpu(xv->xr_clusters);
+ u32 cpos, p_cluster, num_clusters;
+ struct ocfs2_extent_list *el = &xv->xr_list;
+ unsigned int ext_flags;
+
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, el, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ cpos += num_clusters;
+ if ((ext_flags & OCFS2_EXT_REFCOUNTED))
+ continue;
+
+ BUG_ON(!p_cluster);
+
+ ret = ocfs2_add_refcount_flag(inode, value_et,
+ ref_ci, ref_root_bh,
+ cpos - num_clusters,
+ p_cluster, num_clusters,
+ dealloc, refcount);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Given a normal ocfs2_xattr_header, refcount all the entries which
+ * have value stored outside.
+ * Used for xattrs stored in inode and ocfs2_xattr_block.
+ */
+static int ocfs2_xattr_attach_refcount_normal(struct inode *inode,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_xattr_header *header,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+
+ struct ocfs2_xattr_entry *xe;
+ struct ocfs2_xattr_value_root *xv;
+ struct ocfs2_extent_tree et;
+ int i, ret = 0;
+
+ for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+ xe = &header->xh_entries[i];
+
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ xv = (struct ocfs2_xattr_value_root *)((void *)header +
+ le16_to_cpu(xe->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+ vb->vb_xv = xv;
+ ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
+
+ ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et,
+ ref_ci, ref_root_bh,
+ dealloc, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_xattr_inline_attach_refcount(struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+ struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *)
+ (fe_bh->b_data + inode->i_sb->s_blocksize -
+ le16_to_cpu(di->i_xattr_inline_size));
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = fe_bh,
+ .vb_access = ocfs2_journal_access_di,
+ };
+
+ return ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
+ ref_ci, ref_root_bh, dealloc);
+}
+
+struct ocfs2_xattr_tree_value_refcount_para {
+ struct ocfs2_caching_info *ref_ci;
+ struct buffer_head *ref_root_bh;
+ struct ocfs2_cached_dealloc_ctxt *dealloc;
+};
+
+static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
+ struct ocfs2_xattr_bucket *bucket,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **bh)
+{
+ int ret, block_off, name_offset;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+ struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
+ void *base;
+
+ ret = ocfs2_xattr_bucket_get_name_value(sb,
+ bucket_xh(bucket),
+ offset,
+ &block_off,
+ &name_offset);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ base = bucket_block(bucket, block_off);
+
+ *xv = (struct ocfs2_xattr_value_root *)(base + name_offset +
+ OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+ if (bh)
+ *bh = bucket->bu_bhs[block_off];
+out:
+ return ret;
+}
+
+/*
+ * For a given xattr bucket, refcount all the entries which
+ * have value stored outside.
+ */
+static int ocfs2_xattr_bucket_value_refcount(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para)
+{
+ int i, ret = 0;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_xattr_tree_value_refcount_para *ref =
+ (struct ocfs2_xattr_tree_value_refcount_para *)para;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
+ struct ocfs2_xattr_entry *xe;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_access = ocfs2_journal_access,
+ };
+ struct ocfs2_post_refcount refcount = {
+ .credits = bucket->bu_blocks,
+ .para = bucket,
+ .func = ocfs2_xattr_bucket_post_refcount,
+ };
+ struct ocfs2_post_refcount *p = NULL;
+
+ /* We only need post_refcount if we support metaecc. */
+ if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
+ p = &refcount;
+
+ trace_ocfs2_xattr_bucket_value_refcount(
+ (unsigned long long)bucket_blkno(bucket),
+ le16_to_cpu(xh->xh_count));
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i,
+ &vb.vb_xv, &vb.vb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ocfs2_init_xattr_value_extent_tree(&et,
+ INODE_CACHE(inode), &vb);
+
+ ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv,
+ &et, ref->ref_ci,
+ ref->ref_root_bh,
+ ref->dealloc, p);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ return ret;
+
+}
+
+static int ocfs2_refcount_xattr_tree_rec(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno, u32 cpos, u32 len, void *para)
+{
+ return ocfs2_iterate_xattr_buckets(inode, blkno, len,
+ ocfs2_xattr_bucket_value_refcount,
+ para);
+}
+
+static int ocfs2_xattr_block_attach_refcount(struct inode *inode,
+ struct buffer_head *blk_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret = 0;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = blk_bh,
+ .vb_access = ocfs2_journal_access_xb,
+ };
+
+ ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
+ ref_ci, ref_root_bh,
+ dealloc);
+ } else {
+ struct ocfs2_xattr_tree_value_refcount_para para = {
+ .ref_ci = ref_ci,
+ .ref_root_bh = ref_root_bh,
+ .dealloc = dealloc,
+ };
+
+ ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
+ ocfs2_refcount_xattr_tree_rec,
+ &para);
+ }
+
+ return ret;
+}
+
+int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret = 0;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+ struct buffer_head *blk_bh = NULL;
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+ ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh,
+ ref_ci, ref_root_bh,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (!di->i_xattr_loc)
+ goto out;
+
+ ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+ &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci,
+ ref_root_bh, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+ brelse(blk_bh);
+out:
+
+ return ret;
+}
+
+typedef int (should_xattr_reflinked)(struct ocfs2_xattr_entry *xe);
+/*
+ * Store the information we need in xattr reflink.
+ * old_bh and new_bh are inode bh for the old and new inode.
+ */
+struct ocfs2_xattr_reflink {
+ struct inode *old_inode;
+ struct inode *new_inode;
+ struct buffer_head *old_bh;
+ struct buffer_head *new_bh;
+ struct ocfs2_caching_info *ref_ci;
+ struct buffer_head *ref_root_bh;
+ struct ocfs2_cached_dealloc_ctxt *dealloc;
+ should_xattr_reflinked *xattr_reflinked;
+};
+
+/*
+ * Given a xattr header and xe offset,
+ * return the proper xv and the corresponding bh.
+ * xattr in inode, block and xattr tree have different implementaions.
+ */
+typedef int (get_xattr_value_root)(struct super_block *sb,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *xh,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **ret_bh,
+ void *para);
+
+/*
+ * Calculate all the xattr value root metadata stored in this xattr header and
+ * credits we need if we create them from the scratch.
+ * We use get_xattr_value_root so that all types of xattr container can use it.
+ */
+static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *xh,
+ int *metas, int *credits,
+ int *num_recs,
+ get_xattr_value_root *func,
+ void *para)
+{
+ int i, ret = 0;
+ struct ocfs2_xattr_value_root *xv;
+ struct ocfs2_xattr_entry *xe;
+
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ ret = func(sb, bh, xh, i, &xv, NULL, para);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ *metas += le16_to_cpu(xv->xr_list.l_tree_depth) *
+ le16_to_cpu(xv->xr_list.l_next_free_rec);
+
+ *credits += ocfs2_calc_extend_credits(sb,
+ &def_xv.xv.xr_list);
+
+ /*
+ * If the value is a tree with depth > 1, We don't go deep
+ * to the extent block, so just calculate a maximum record num.
+ */
+ if (!xv->xr_list.l_tree_depth)
+ *num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec);
+ else
+ *num_recs += ocfs2_clusters_for_bytes(sb,
+ XATTR_SIZE_MAX);
+ }
+
+ return ret;
+}
+
+/* Used by xattr inode and block to return the right xv and buffer_head. */
+static int ocfs2_get_xattr_value_root(struct super_block *sb,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *xh,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **ret_bh,
+ void *para)
+{
+ struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
+
+ *xv = (struct ocfs2_xattr_value_root *)((void *)xh +
+ le16_to_cpu(xe->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+ if (ret_bh)
+ *ret_bh = bh;
+
+ return 0;
+}
+
+/*
+ * Lock the meta_ac and caculate how much credits we need for reflink xattrs.
+ * It is only used for inline xattr and xattr block.
+ */
+static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb,
+ struct ocfs2_xattr_header *xh,
+ struct buffer_head *ref_root_bh,
+ int *credits,
+ struct ocfs2_alloc_context **meta_ac)
+{
+ int ret, meta_add = 0, num_recs = 0;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+ *credits = 0;
+
+ ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh,
+ &meta_add, credits, &num_recs,
+ ocfs2_get_xattr_value_root,
+ NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We need to add/modify num_recs in refcount tree, so just calculate
+ * an approximate number we need for refcount tree change.
+ * Sometimes we need to split the tree, and after split, half recs
+ * will be moved to the new block, and a new block can only provide
+ * half number of recs. So we multiple new blocks by 2.
+ */
+ num_recs = num_recs / ocfs2_refcount_recs_per_rb(osb->sb) * 2;
+ meta_add += num_recs;
+ *credits += num_recs + num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+ if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+ *credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
+ le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
+ else
+ *credits += 1;
+
+ ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+/*
+ * Given a xattr header, reflink all the xattrs in this container.
+ * It can be used for inode, block and bucket.
+ *
+ * NOTE:
+ * Before we call this function, the caller has memcpy the xattr in
+ * old_xh to the new_xh.
+ *
+ * If args.xattr_reflinked is set, call it to decide whether the xe should
+ * be reflinked or not. If not, remove it from the new xattr header.
+ */
+static int ocfs2_reflink_xattr_header(handle_t *handle,
+ struct ocfs2_xattr_reflink *args,
+ struct buffer_head *old_bh,
+ struct ocfs2_xattr_header *xh,
+ struct buffer_head *new_bh,
+ struct ocfs2_xattr_header *new_xh,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_alloc_context *meta_ac,
+ get_xattr_value_root *func,
+ void *para)
+{
+ int ret = 0, i, j;
+ struct super_block *sb = args->old_inode->i_sb;
+ struct buffer_head *value_bh;
+ struct ocfs2_xattr_entry *xe, *last;
+ struct ocfs2_xattr_value_root *xv, *new_xv;
+ struct ocfs2_extent_tree data_et;
+ u32 clusters, cpos, p_cluster, num_clusters;
+ unsigned int ext_flags = 0;
+
+ trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr,
+ le16_to_cpu(xh->xh_count));
+
+ last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
+ for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
+ xe = &xh->xh_entries[i];
+
+ if (args->xattr_reflinked && !args->xattr_reflinked(xe)) {
+ xe = &new_xh->xh_entries[j];
+
+ le16_add_cpu(&new_xh->xh_count, -1);
+ if (new_xh->xh_count) {
+ memmove(xe, xe + 1,
+ (void *)last - (void *)xe);
+ memset(last, 0,
+ sizeof(struct ocfs2_xattr_entry));
+ }
+
+ /*
+ * We don't want j to increase in the next round since
+ * it is already moved ahead.
+ */
+ j--;
+ continue;
+ }
+
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ ret = func(sb, old_bh, xh, i, &xv, NULL, para);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = func(sb, new_bh, new_xh, j, &new_xv, &value_bh, para);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ /*
+ * For the xattr which has l_tree_depth = 0, all the extent
+ * recs have already be copied to the new xh with the
+ * propriate OCFS2_EXT_REFCOUNTED flag we just need to
+ * increase the refount count int the refcount tree.
+ *
+ * For the xattr which has l_tree_depth > 0, we need
+ * to initialize it to the empty default value root,
+ * and then insert the extents one by one.
+ */
+ if (xv->xr_list.l_tree_depth) {
+ memcpy(new_xv, &def_xv, sizeof(def_xv));
+ vb->vb_xv = new_xv;
+ vb->vb_bh = value_bh;
+ ocfs2_init_xattr_value_extent_tree(&data_et,
+ INODE_CACHE(args->new_inode), vb);
+ }
+
+ clusters = le32_to_cpu(xv->xr_clusters);
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(args->old_inode,
+ cpos,
+ &p_cluster,
+ &num_clusters,
+ &xv->xr_list,
+ &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(!p_cluster);
+
+ if (xv->xr_list.l_tree_depth) {
+ ret = ocfs2_insert_extent(handle,
+ &data_et, cpos,
+ ocfs2_clusters_to_blocks(
+ args->old_inode->i_sb,
+ p_cluster),
+ num_clusters, ext_flags,
+ meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_increase_refcount(handle, args->ref_ci,
+ args->ref_root_bh,
+ p_cluster, num_clusters,
+ meta_ac, args->dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cpos += num_clusters;
+ }
+ }
+
+out:
+ return ret;
+}
+
+static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
+{
+ int ret = 0, credits = 0;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data;
+ int inline_size = le16_to_cpu(di->i_xattr_inline_size);
+ int header_off = osb->sb->s_blocksize - inline_size;
+ struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)
+ (args->old_bh->b_data + header_off);
+ struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *)
+ (args->new_bh->b_data + header_off);
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_inode_info *new_oi;
+ struct ocfs2_dinode *new_di;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = args->new_bh,
+ .vb_access = ocfs2_journal_access_di,
+ };
+
+ ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
+ &credits, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode),
+ args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ memcpy(args->new_bh->b_data + header_off,
+ args->old_bh->b_data + header_off, inline_size);
+
+ new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
+ new_di->i_xattr_inline_size = cpu_to_le16(inline_size);
+
+ ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh,
+ args->new_bh, new_xh, &vb, meta_ac,
+ ocfs2_get_xattr_value_root, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ new_oi = OCFS2_I(args->new_inode);
+ /*
+ * Adjust extent record count to reserve space for extended attribute.
+ * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
+ */
+ if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
+ !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
+ struct ocfs2_extent_list *el = &new_di->id2.i_list;
+ le16_add_cpu(&el->l_count, -(inline_size /
+ sizeof(struct ocfs2_extent_rec)));
+ }
+ spin_lock(&new_oi->ip_lock);
+ new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
+ new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
+ spin_unlock(&new_oi->ip_lock);
+
+ ocfs2_journal_dirty(handle, args->new_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+static int ocfs2_create_empty_xattr_block(struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct buffer_head **ret_bh,
+ int indexed)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_xattr_set_ctxt ctxt;
+
+ memset(&ctxt, 0, sizeof(ctxt));
+ ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
+ if (IS_ERR(ctxt.handle)) {
+ ret = PTR_ERR(ctxt.handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_create_empty_xattr_block(
+ (unsigned long long)fe_bh->b_blocknr, indexed);
+ ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
+ ret_bh);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, ctxt.handle);
+out:
+ ocfs2_free_alloc_context(ctxt.meta_ac);
+ return ret;
+}
+
+static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args,
+ struct buffer_head *blk_bh,
+ struct buffer_head *new_blk_bh)
+{
+ int ret = 0, credits = 0;
+ handle_t *handle;
+ struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode);
+ struct ocfs2_dinode *new_di;
+ struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb);
+ int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)blk_bh->b_data;
+ struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header;
+ struct ocfs2_xattr_block *new_xb =
+ (struct ocfs2_xattr_block *)new_blk_bh->b_data;
+ struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header;
+ struct ocfs2_alloc_context *meta_ac;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = new_blk_bh,
+ .vb_access = ocfs2_journal_access_xb,
+ };
+
+ ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
+ &credits, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* One more credits in case we need to add xattr flags in new inode. */
+ handle = ocfs2_start_trans(osb, credits + 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
+ ret = ocfs2_journal_access_di(handle,
+ INODE_CACHE(args->new_inode),
+ args->new_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ }
+
+ ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode),
+ new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off,
+ osb->sb->s_blocksize - header_off);
+
+ ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh,
+ new_blk_bh, new_xh, &vb, meta_ac,
+ ocfs2_get_xattr_value_root, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ocfs2_journal_dirty(handle, new_blk_bh);
+
+ if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
+ new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
+ spin_lock(&new_oi->ip_lock);
+ new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
+ new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
+ spin_unlock(&new_oi->ip_lock);
+
+ ocfs2_journal_dirty(handle, args->new_bh);
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+struct ocfs2_reflink_xattr_tree_args {
+ struct ocfs2_xattr_reflink *reflink;
+ struct buffer_head *old_blk_bh;
+ struct buffer_head *new_blk_bh;
+ struct ocfs2_xattr_bucket *old_bucket;
+ struct ocfs2_xattr_bucket *new_bucket;
+};
+
+/*
+ * NOTE:
+ * We have to handle the case that both old bucket and new bucket
+ * will call this function to get the right ret_bh.
+ * So The caller must give us the right bh.
+ */
+static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *xh,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **ret_bh,
+ void *para)
+{
+ struct ocfs2_reflink_xattr_tree_args *args =
+ (struct ocfs2_reflink_xattr_tree_args *)para;
+ struct ocfs2_xattr_bucket *bucket;
+
+ if (bh == args->old_bucket->bu_bhs[0])
+ bucket = args->old_bucket;
+ else
+ bucket = args->new_bucket;
+
+ return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
+ xv, ret_bh);
+}
+
+struct ocfs2_value_tree_metas {
+ int num_metas;
+ int credits;
+ int num_recs;
+};
+
+static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *xh,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **ret_bh,
+ void *para)
+{
+ struct ocfs2_xattr_bucket *bucket =
+ (struct ocfs2_xattr_bucket *)para;
+
+ return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
+ xv, ret_bh);
+}
+
+static int ocfs2_calc_value_tree_metas(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para)
+{
+ struct ocfs2_value_tree_metas *metas =
+ (struct ocfs2_value_tree_metas *)para;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
+
+ /* Add the credits for this bucket first. */
+ metas->credits += bucket->bu_blocks;
+ return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0],
+ xh, &metas->num_metas,
+ &metas->credits, &metas->num_recs,
+ ocfs2_value_tree_metas_in_bucket,
+ bucket);
+}
+
+/*
+ * Given a xattr extent rec starting from blkno and having len clusters,
+ * iterate all the buckets calculate how much metadata we need for reflinking
+ * all the ocfs2_xattr_value_root and lock the allocators accordingly.
+ */
+static int ocfs2_lock_reflink_xattr_rec_allocators(
+ struct ocfs2_reflink_xattr_tree_args *args,
+ struct ocfs2_extent_tree *xt_et,
+ u64 blkno, u32 len, int *credits,
+ struct ocfs2_alloc_context **meta_ac,
+ struct ocfs2_alloc_context **data_ac)
+{
+ int ret, num_free_extents;
+ struct ocfs2_value_tree_metas metas;
+ struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb);
+ struct ocfs2_refcount_block *rb;
+
+ memset(&metas, 0, sizeof(metas));
+
+ ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len,
+ ocfs2_calc_value_tree_metas, &metas);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ *credits = metas.credits;
+
+ /*
+ * Calculate we need for refcount tree change.
+ *
+ * We need to add/modify num_recs in refcount tree, so just calculate
+ * an approximate number we need for refcount tree change.
+ * Sometimes we need to split the tree, and after split, half recs
+ * will be moved to the new block, and a new block can only provide
+ * half number of recs. So we multiple new blocks by 2.
+ * In the end, we have to add credits for modifying the already
+ * existed refcount block.
+ */
+ rb = (struct ocfs2_refcount_block *)args->reflink->ref_root_bh->b_data;
+ metas.num_recs =
+ (metas.num_recs + ocfs2_refcount_recs_per_rb(osb->sb) - 1) /
+ ocfs2_refcount_recs_per_rb(osb->sb) * 2;
+ metas.num_metas += metas.num_recs;
+ *credits += metas.num_recs +
+ metas.num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+ if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+ *credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
+ le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
+ else
+ *credits += 1;
+
+ /* count in the xattr tree change. */
+ num_free_extents = ocfs2_num_free_extents(osb, xt_et);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (num_free_extents < len)
+ metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
+
+ *credits += ocfs2_calc_extend_credits(osb->sb,
+ xt_et->et_root_el);
+
+ if (metas.num_metas) {
+ ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
+ meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (len) {
+ ret = ocfs2_reserve_clusters(osb, len, data_ac);
+ if (ret)
+ mlog_errno(ret);
+ }
+out:
+ if (ret) {
+ if (*meta_ac) {
+ ocfs2_free_alloc_context(*meta_ac);
+ *meta_ac = NULL;
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_reflink_xattr_bucket(handle_t *handle,
+ u64 blkno, u64 new_blkno, u32 clusters,
+ u32 *cpos, int num_buckets,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_reflink_xattr_tree_args *args)
+{
+ int i, j, ret = 0;
+ struct super_block *sb = args->reflink->old_inode->i_sb;
+ int bpb = args->old_bucket->bu_blocks;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_access = ocfs2_journal_access,
+ };
+
+ for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) {
+ ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_xattr_bucket_journal_access(handle,
+ args->new_bucket,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ for (j = 0; j < bpb; j++)
+ memcpy(bucket_block(args->new_bucket, j),
+ bucket_block(args->old_bucket, j),
+ sb->s_blocksize);
+
+ /*
+ * Record the start cpos so that we can use it to initialize
+ * our xattr tree we also set the xh_num_bucket for the new
+ * bucket.
+ */
+ if (i == 0) {
+ *cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
+ xh_entries[0].xe_name_hash);
+ bucket_xh(args->new_bucket)->xh_num_buckets =
+ cpu_to_le16(num_buckets);
+ }
+
+ ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
+ ret = ocfs2_reflink_xattr_header(handle, args->reflink,
+ args->old_bucket->bu_bhs[0],
+ bucket_xh(args->old_bucket),
+ args->new_bucket->bu_bhs[0],
+ bucket_xh(args->new_bucket),
+ &vb, meta_ac,
+ ocfs2_get_reflink_xattr_value_root,
+ args);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ /*
+ * Re-access and dirty the bucket to calculate metaecc.
+ * Because we may extend the transaction in reflink_xattr_header
+ * which will let the already accessed block gone.
+ */
+ ret = ocfs2_xattr_bucket_journal_access(handle,
+ args->new_bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
+ ocfs2_xattr_bucket_relse(args->old_bucket);
+ ocfs2_xattr_bucket_relse(args->new_bucket);
+ }
+
+ ocfs2_xattr_bucket_relse(args->old_bucket);
+ ocfs2_xattr_bucket_relse(args->new_bucket);
+ return ret;
+}
+
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+ struct inode *inode,
+ struct ocfs2_reflink_xattr_tree_args *args,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac,
+ u64 blkno, u32 cpos, u32 len)
+{
+ int ret, first_inserted = 0;
+ u32 p_cluster, num_clusters, reflink_cpos = 0;
+ u64 new_blkno;
+ unsigned int num_buckets, reflink_buckets;
+ unsigned int bpc =
+ ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+
+ ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
+ ocfs2_xattr_bucket_relse(args->old_bucket);
+
+ while (len && num_buckets) {
+ ret = ocfs2_claim_clusters(handle, data_ac,
+ 1, &p_cluster, &num_clusters);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+ reflink_buckets = min(num_buckets, bpc * num_clusters);
+
+ ret = ocfs2_reflink_xattr_bucket(handle, blkno,
+ new_blkno, num_clusters,
+ &reflink_cpos, reflink_buckets,
+ meta_ac, data_ac, args);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * For the 1st allocated cluster, we make it use the same cpos
+ * so that the xattr tree looks the same as the original one
+ * in the most case.
+ */
+ if (!first_inserted) {
+ reflink_cpos = cpos;
+ first_inserted = 1;
+ }
+ ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
+ num_clusters, 0, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
+ trace_ocfs2_reflink_xattr_buckets((unsigned long long)new_blkno,
+ num_clusters, reflink_cpos);
+
+ len -= num_clusters;
+ blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+ num_buckets -= reflink_buckets;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Create the same xattr extent record in the new inode's xattr tree.
+ */
+static int ocfs2_reflink_xattr_rec(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno,
+ u32 cpos,
+ u32 len,
+ void *para)
+{
+ int ret, credits = 0;
+ handle_t *handle;
+ struct ocfs2_reflink_xattr_tree_args *args =
+ (struct ocfs2_reflink_xattr_tree_args *)para;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_extent_tree et;
+
+ trace_ocfs2_reflink_xattr_rec((unsigned long long)blkno, len);
+
+ ocfs2_init_xattr_tree_extent_tree(&et,
+ INODE_CACHE(args->reflink->new_inode),
+ args->new_blk_bh);
+
+ ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno,
+ len, &credits,
+ &meta_ac, &data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
+ meta_ac, data_ac,
+ blkno, cpos, len);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ return ret;
+}
+
+/*
+ * Create reflinked xattr buckets.
+ * We will add bucket one by one, and refcount all the xattrs in the bucket
+ * if they are stored outside.
+ */
+static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args,
+ struct buffer_head *blk_bh,
+ struct buffer_head *new_blk_bh)
+{
+ int ret;
+ struct ocfs2_reflink_xattr_tree_args para;
+
+ memset(&para, 0, sizeof(para));
+ para.reflink = args;
+ para.old_blk_bh = blk_bh;
+ para.new_blk_bh = new_blk_bh;
+
+ para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode);
+ if (!para.old_bucket) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
+
+ para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode);
+ if (!para.new_bucket) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh,
+ ocfs2_reflink_xattr_rec,
+ &para);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ ocfs2_xattr_bucket_free(para.old_bucket);
+ ocfs2_xattr_bucket_free(para.new_bucket);
+ return ret;
+}
+
+static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
+ struct buffer_head *blk_bh)
+{
+ int ret, indexed = 0;
+ struct buffer_head *new_blk_bh = NULL;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+
+ if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)
+ indexed = 1;
+
+ ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh,
+ &new_blk_bh, indexed);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!indexed)
+ ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
+ else
+ ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ brelse(new_blk_bh);
+ return ret;
+}
+
+static int ocfs2_reflink_xattr_no_security(struct ocfs2_xattr_entry *xe)
+{
+ int type = ocfs2_xattr_get_type(xe);
+
+ return type != OCFS2_XATTR_INDEX_SECURITY &&
+ type != OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS &&
+ type != OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+}
+
+int ocfs2_reflink_xattrs(struct inode *old_inode,
+ struct buffer_head *old_bh,
+ struct inode *new_inode,
+ struct buffer_head *new_bh,
+ bool preserve_security)
+{
+ int ret;
+ struct ocfs2_xattr_reflink args;
+ struct ocfs2_inode_info *oi = OCFS2_I(old_inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct buffer_head *ref_root_bh = NULL;
+
+ ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb),
+ le64_to_cpu(di->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ args.old_inode = old_inode;
+ args.new_inode = new_inode;
+ args.old_bh = old_bh;
+ args.new_bh = new_bh;
+ args.ref_ci = &ref_tree->rf_ci;
+ args.ref_root_bh = ref_root_bh;
+ args.dealloc = &dealloc;
+ if (preserve_security)
+ args.xattr_reflinked = NULL;
+ else
+ args.xattr_reflinked = ocfs2_reflink_xattr_no_security;
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+ ret = ocfs2_reflink_xattr_inline(&args);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+ }
+
+ if (!di->i_xattr_loc)
+ goto out_unlock;
+
+ ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc),
+ &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_reflink_xattr_in_block(&args, blk_bh);
+ if (ret)
+ mlog_errno(ret);
+
+ brelse(blk_bh);
+
+out_unlock:
+ ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb),
+ ref_tree, 1);
+ brelse(ref_root_bh);
+
+ if (ocfs2_dealloc_has_cluster(&dealloc)) {
+ ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1);
+ ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc);
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * Initialize security and acl for a already created inode.
+ * Used for reflink a non-preserve-security file.
+ *
+ * It uses common api like ocfs2_xattr_set, so the caller
+ * must not hold any lock expect i_mutex.
+ */
+int ocfs2_init_security_and_acl(struct inode *dir,
+ struct inode *inode,
+ const struct qstr *qstr,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl)
+{
+ struct buffer_head *dir_bh = NULL;
+ int ret = 0;
+
+ ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ ret = ocfs2_inode_lock(dir, &dir_bh, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ if (!ret && default_acl)
+ ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ if (!ret && acl)
+ ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS);
+
+ ocfs2_inode_unlock(dir, 0);
+ brelse(dir_bh);
+leave:
+ return ret;
+}
+/*
+ * 'security' attributes support
+ */
+static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
+ size_t list_size, const char *name,
+ size_t name_len, int type)
+{
+ const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+ const size_t total_len = prefix_len + name_len + 1;
+
+ if (list && total_len <= list_size) {
+ memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+ memcpy(list + prefix_len, name, name_len);
+ list[prefix_len + name_len] = '\0';
+ }
+ return total_len;
+}
+
+static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
+ name, buffer, size);
+}
+
+static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
+ name, value, size, flags);
+}
+
+int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
+{
+ const struct xattr *xattr;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, XATTR_CREATE);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+int ocfs2_init_security_get(struct inode *inode,
+ struct inode *dir,
+ const struct qstr *qstr,
+ struct ocfs2_security_xattr_info *si)
+{
+ /* check whether ocfs2 support feature xattr */
+ if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
+ return -EOPNOTSUPP;
+ if (si)
+ return security_old_inode_init_security(inode, dir, qstr,
+ &si->name, &si->value,
+ &si->value_len);
+
+ return security_inode_init_security(inode, dir, qstr,
+ &ocfs2_initxattrs, NULL);
+}
+
+int ocfs2_init_security_set(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *di_bh,
+ struct ocfs2_security_xattr_info *si,
+ struct ocfs2_alloc_context *xattr_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ return ocfs2_xattr_set_handle(handle, inode, di_bh,
+ OCFS2_XATTR_INDEX_SECURITY,
+ si->name, si->value, si->value_len, 0,
+ xattr_ac, data_ac);
+}
+
+const struct xattr_handler ocfs2_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = ocfs2_xattr_security_list,
+ .get = ocfs2_xattr_security_get,
+ .set = ocfs2_xattr_security_set,
+};
+
+/*
+ * 'trusted' attributes support
+ */
+static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
+ size_t list_size, const char *name,
+ size_t name_len, int type)
+{
+ const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+ const size_t total_len = prefix_len + name_len + 1;
+
+ if (list && total_len <= list_size) {
+ memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+ memcpy(list + prefix_len, name, name_len);
+ list[prefix_len + name_len] = '\0';
+ }
+ return total_len;
+}
+
+static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
+ name, buffer, size);
+}
+
+static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
+ name, value, size, flags);
+}
+
+const struct xattr_handler ocfs2_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = ocfs2_xattr_trusted_list,
+ .get = ocfs2_xattr_trusted_get,
+ .set = ocfs2_xattr_trusted_set,
+};
+
+/*
+ * 'user' attributes support
+ */
+static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
+ size_t list_size, const char *name,
+ size_t name_len, int type)
+{
+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
+ const size_t total_len = prefix_len + name_len + 1;
+ struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+
+ if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return 0;
+
+ if (list && total_len <= list_size) {
+ memcpy(list, XATTR_USER_PREFIX, prefix_len);
+ memcpy(list + prefix_len, name, name_len);
+ list[prefix_len + name_len] = '\0';
+ }
+ return total_len;
+}
+
+static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return -EOPNOTSUPP;
+ return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name,
+ buffer, size);
+}
+
+static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return -EOPNOTSUPP;
+
+ return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER,
+ name, value, size, flags);
+}
+
+const struct xattr_handler ocfs2_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .list = ocfs2_xattr_user_list,
+ .get = ocfs2_xattr_user_get,
+ .set = ocfs2_xattr_user_set,
+};
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
new file mode 100644
index 00000000000..f10d5b93c36
--- /dev/null
+++ b/fs/ocfs2/xattr.h
@@ -0,0 +1,100 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.h
+ *
+ * Copyright (C) 2004, 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_XATTR_H
+#define OCFS2_XATTR_H
+
+#include <linux/init.h>
+#include <linux/xattr.h>
+
+enum ocfs2_xattr_type {
+ OCFS2_XATTR_INDEX_USER = 1,
+ OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS,
+ OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+ OCFS2_XATTR_INDEX_TRUSTED,
+ OCFS2_XATTR_INDEX_SECURITY,
+ OCFS2_XATTR_MAX
+};
+
+struct ocfs2_security_xattr_info {
+ int enable;
+ const char *name;
+ void *value;
+ size_t value_len;
+};
+
+extern const struct xattr_handler ocfs2_xattr_user_handler;
+extern const struct xattr_handler ocfs2_xattr_trusted_handler;
+extern const struct xattr_handler ocfs2_xattr_security_handler;
+extern const struct xattr_handler *ocfs2_xattr_handlers[];
+
+ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
+ const char *, void *, size_t);
+int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
+ size_t, int);
+int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
+ int, const char *, const void *, size_t, int,
+ struct ocfs2_alloc_context *,
+ struct ocfs2_alloc_context *);
+int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
+ struct ocfs2_dinode *di);
+int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
+int ocfs2_init_security_get(struct inode *, struct inode *,
+ const struct qstr *,
+ struct ocfs2_security_xattr_info *);
+int ocfs2_init_security_set(handle_t *, struct inode *,
+ struct buffer_head *,
+ struct ocfs2_security_xattr_info *,
+ struct ocfs2_alloc_context *,
+ struct ocfs2_alloc_context *);
+int ocfs2_calc_security_init(struct inode *,
+ struct ocfs2_security_xattr_info *,
+ int *, int *, struct ocfs2_alloc_context **);
+int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
+ umode_t, struct ocfs2_security_xattr_info *,
+ int *, int *, int *);
+
+/*
+ * xattrs can live inside an inode, as part of an external xattr block,
+ * or inside an xattr bucket, which is the leaf of a tree rooted in an
+ * xattr block. Some of the xattr calls, especially the value setting
+ * functions, want to treat each of these locations as equal. Let's wrap
+ * them in a structure that we can pass around instead of raw buffer_heads.
+ */
+struct ocfs2_xattr_value_buf {
+ struct buffer_head *vb_bh;
+ ocfs2_journal_access_func vb_access;
+ struct ocfs2_xattr_value_root *vb_xv;
+};
+
+int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_reflink_xattrs(struct inode *old_inode,
+ struct buffer_head *old_bh,
+ struct inode *new_inode,
+ struct buffer_head *new_bh,
+ bool preserve_security);
+int ocfs2_init_security_and_acl(struct inode *dir,
+ struct inode *inode,
+ const struct qstr *qstr,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl);
+#endif /* OCFS2_XATTR_H */