aboutsummaryrefslogtreecommitdiff
path: root/fs/ocfs2
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/Kconfig15
-rw-r--r--fs/ocfs2/Makefile16
-rw-r--r--fs/ocfs2/acl.c365
-rw-r--r--fs/ocfs2/acl.h37
-rw-r--r--fs/ocfs2/alloc.c2684
-rw-r--r--fs/ocfs2/alloc.h121
-rw-r--r--fs/ocfs2/aops.c544
-rw-r--r--fs/ocfs2/aops.h40
-rw-r--r--fs/ocfs2/blockcheck.c232
-rw-r--r--fs/ocfs2/blockcheck.h29
-rw-r--r--fs/ocfs2/buffer_head_io.c95
-rw-r--r--fs/ocfs2/buffer_head_io.h8
-rw-r--r--fs/ocfs2/cluster/Makefile2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c1103
-rw-r--r--fs/ocfs2/cluster/heartbeat.h4
-rw-r--r--fs/ocfs2/cluster/masklog.c22
-rw-r--r--fs/ocfs2/cluster/masklog.h154
-rw-r--r--fs/ocfs2/cluster/netdebug.c372
-rw-r--r--fs/ocfs2/cluster/nodemanager.c61
-rw-r--r--fs/ocfs2/cluster/nodemanager.h7
-rw-r--r--fs/ocfs2/cluster/ocfs2_nodemanager.h6
-rw-r--r--fs/ocfs2/cluster/quorum.c27
-rw-r--r--fs/ocfs2/cluster/sys.c11
-rw-r--r--fs/ocfs2/cluster/tcp.c518
-rw-r--r--fs/ocfs2/cluster/tcp.h2
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h41
-rw-r--r--fs/ocfs2/cluster/ver.c42
-rw-r--r--fs/ocfs2/cluster/ver.h31
-rw-r--r--fs/ocfs2/dcache.c158
-rw-r--r--fs/ocfs2/dcache.h10
-rw-r--r--fs/ocfs2/dir.c574
-rw-r--r--fs/ocfs2/dir.h5
-rw-r--r--fs/ocfs2/dlm/Makefile7
-rw-r--r--fs/ocfs2/dlm/dlmapi.h2
-rw-r--r--fs/ocfs2/dlm/dlmast.c100
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h193
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c32
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c262
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h5
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c647
-rw-r--r--fs/ocfs2/dlm/dlmfsver.c42
-rw-r--r--fs/ocfs2/dlm/dlmfsver.h31
-rw-r--r--fs/ocfs2/dlm/dlmlock.c82
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c650
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c451
-rw-r--r--fs/ocfs2/dlm/dlmthread.c274
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c43
-rw-r--r--fs/ocfs2/dlm/dlmver.c42
-rw-r--r--fs/ocfs2/dlm/dlmver.h31
-rw-r--r--fs/ocfs2/dlmfs/Makefile5
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c (renamed from fs/ocfs2/dlm/dlmfs.c)229
-rw-r--r--fs/ocfs2/dlmfs/userdlm.c (renamed from fs/ocfs2/dlm/userdlm.c)308
-rw-r--r--fs/ocfs2/dlmfs/userdlm.h (renamed from fs/ocfs2/dlm/userdlm.h)16
-rw-r--r--fs/ocfs2/dlmglue.c971
-rw-r--r--fs/ocfs2/dlmglue.h41
-rw-r--r--fs/ocfs2/export.c82
-rw-r--r--fs/ocfs2/extent_map.c173
-rw-r--r--fs/ocfs2/extent_map.h10
-rw-r--r--fs/ocfs2/file.c1493
-rw-r--r--fs/ocfs2/file.h8
-rw-r--r--fs/ocfs2/heartbeat.c5
-rw-r--r--fs/ocfs2/inode.c496
-rw-r--r--fs/ocfs2/inode.h59
-rw-r--r--fs/ocfs2/ioctl.c891
-rw-r--r--fs/ocfs2/ioctl.h6
-rw-r--r--fs/ocfs2/journal.c560
-rw-r--r--fs/ocfs2/journal.h174
-rw-r--r--fs/ocfs2/localalloc.c455
-rw-r--r--fs/ocfs2/localalloc.h9
-rw-r--r--fs/ocfs2/locks.c5
-rw-r--r--fs/ocfs2/mmap.c132
-rw-r--r--fs/ocfs2/move_extents.c1078
-rw-r--r--fs/ocfs2/move_extents.h22
-rw-r--r--fs/ocfs2/namei.c1245
-rw-r--r--fs/ocfs2/namei.h6
-rw-r--r--fs/ocfs2/ocfs2.h299
-rw-r--r--fs/ocfs2/ocfs2_fs.h384
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h242
-rw-r--r--fs/ocfs2/ocfs2_lockid.h11
-rw-r--r--fs/ocfs2/ocfs2_lockingver.h2
-rw-r--r--fs/ocfs2/ocfs2_trace.h2768
-rw-r--r--fs/ocfs2/quota.h26
-rw-r--r--fs/ocfs2/quota_global.c527
-rw-r--r--fs/ocfs2/quota_local.c369
-rw-r--r--fs/ocfs2/refcounttree.c4476
-rw-r--r--fs/ocfs2/refcounttree.h118
-rw-r--r--fs/ocfs2/reservations.c839
-rw-r--r--fs/ocfs2/reservations.h159
-rw-r--r--fs/ocfs2/resize.c80
-rw-r--r--fs/ocfs2/slot_map.c32
-rw-r--r--fs/ocfs2/stack_o2cb.c136
-rw-r--r--fs/ocfs2/stack_user.c370
-rw-r--r--fs/ocfs2/stackglue.c170
-rw-r--r--fs/ocfs2/stackglue.h114
-rw-r--r--fs/ocfs2/suballoc.c1498
-rw-r--r--fs/ocfs2/suballoc.h64
-rw-r--r--fs/ocfs2/super.c843
-rw-r--r--fs/ocfs2/super.h21
-rw-r--r--fs/ocfs2/symlink.c122
-rw-r--r--fs/ocfs2/symlink.h2
-rw-r--r--fs/ocfs2/sysfile.c81
-rw-r--r--fs/ocfs2/uptodate.c317
-rw-r--r--fs/ocfs2/uptodate.h51
-rw-r--r--fs/ocfs2/ver.c43
-rw-r--r--fs/ocfs2/ver.h31
-rw-r--r--fs/ocfs2/xattr.c4621
-rw-r--r--fs/ocfs2/xattr.h35
107 files changed, 27545 insertions, 10015 deletions
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 701b7a3a872..77a8de5f711 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,11 +1,11 @@
config OCFS2_FS
tristate "OCFS2 file system support"
- depends on NET && SYSFS
- select CONFIGFS_FS
+ depends on NET && SYSFS && CONFIGFS_FS
select JBD2
select CRC32
select QUOTA
select QUOTA_TREE
+ select FS_POSIX_ACL
help
OCFS2 is a general purpose extent based shared disk cluster file
system with many similarities to ext3. It supports 64 bit inode
@@ -50,7 +50,7 @@ config OCFS2_FS_USERSPACE_CLUSTER
config OCFS2_FS_STATS
bool "OCFS2 statistics"
- depends on OCFS2_FS
+ depends on OCFS2_FS && DEBUG_FS
default y
help
This option allows some fs statistics to be captured. Enabling
@@ -74,12 +74,3 @@ config OCFS2_DEBUG_FS
This option will enable expensive consistency checks. Enable
this option for debugging only as it is likely to decrease
performance of the filesystem.
-
-config OCFS2_FS_POSIX_ACL
- bool "OCFS2 POSIX Access Control Lists"
- depends on OCFS2_FS
- select FS_POSIX_ACL
- default n
- help
- Posix Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 01596079dd6..ce210d4951a 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,6 +1,6 @@
-EXTRA_CFLAGS += -Ifs/ocfs2
+ccflags-y := -Ifs/ocfs2
-EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
+ccflags-y += -DCATCH_BH_JBD_RACES
obj-$(CONFIG_OCFS2_FS) += \
ocfs2.o \
@@ -28,6 +28,9 @@ ocfs2-objs := \
locks.o \
mmap.o \
namei.o \
+ refcounttree.o \
+ reservations.o \
+ move_extents.o \
resize.o \
slot_map.o \
suballoc.o \
@@ -35,19 +38,16 @@ ocfs2-objs := \
symlink.o \
sysfile.o \
uptodate.o \
- ver.o \
quota_local.o \
quota_global.o \
- xattr.o
-
-ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
-ocfs2-objs += acl.o
-endif
+ xattr.o \
+ acl.o
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o
ocfs2_stack_user-objs := stack_user.o
+obj-$(CONFIG_OCFS2_FS) += dlmfs/
# cluster/ is always needed when OCFS2_FS for masklog support
obj-$(CONFIG_OCFS2_FS) += cluster/
obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index fbeaec76210..7e8282dcea2 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,15 +21,17 @@
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/string.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dlmglue.h"
#include "file.h"
+#include "inode.h"
+#include "journal.h"
#include "ocfs2_fs.h"
#include "xattr.h"
@@ -49,10 +51,6 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
return ERR_PTR(-EINVAL);
count = size / sizeof(struct posix_acl_entry);
- if (count < 0)
- return ERR_PTR(-EINVAL);
- if (count == 0)
- return NULL;
acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
@@ -63,7 +61,20 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
- acl->a_entries[n].e_id = le32_to_cpu(entry->e_id);
+ switch(acl->a_entries[n].e_tag) {
+ case ACL_USER:
+ acl->a_entries[n].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+ case ACL_GROUP:
+ acl->a_entries[n].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+ default:
+ break;
+ }
value += sizeof(struct posix_acl_entry);
}
@@ -89,7 +100,21 @@ static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
for (n = 0; n < acl->a_count; n++, entry++) {
entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
- entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
+ switch(acl->a_entries[n].e_tag) {
+ case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns,
+ acl->a_entries[n].e_uid));
+ break;
+ case ACL_GROUP:
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns,
+ acl->a_entries[n].e_gid));
+ break;
+ default:
+ entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+ break;
+ }
}
return ocfs2_acl;
}
@@ -98,15 +123,11 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
int type,
struct buffer_head *di_bh)
{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int name_index;
char *value = NULL;
struct posix_acl *acl;
int retval;
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return NULL;
-
switch (type) {
case ACL_TYPE_ACCESS:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -139,40 +160,68 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
return acl;
}
-
/*
- * Get posix acl.
+ * Helper function to set i_mode in memory and disk. Some call paths
+ * will not have di_bh or a journal handle to pass, in which case it
+ * will create its own.
*/
-static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
+static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
+ handle_t *handle, umode_t new_mode)
{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct buffer_head *di_bh = NULL;
- struct posix_acl *acl;
- int ret;
+ int ret, commit_handle = 0;
+ struct ocfs2_dinode *di;
+
+ if (di_bh == NULL) {
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else
+ get_bh(di_bh);
+
+ if (handle == NULL) {
+ handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_brelse;
+ }
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return NULL;
+ commit_handle = 1;
+ }
- ret = ocfs2_inode_lock(inode, &di_bh, 0);
- if (ret < 0) {
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
mlog_errno(ret);
- acl = ERR_PTR(ret);
- return acl;
+ goto out_commit;
}
- acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+ inode->i_mode = new_mode;
+ inode->i_ctime = CURRENT_TIME;
+ di->i_mode = cpu_to_le16(inode->i_mode);
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
- ocfs2_inode_unlock(inode, 0);
+ ocfs2_journal_dirty(handle, di_bh);
+out_commit:
+ if (commit_handle)
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out_brelse:
brelse(di_bh);
-
- return acl;
+out:
+ return ret;
}
/*
* Set the access or default ACL of an inode.
*/
-static int ocfs2_set_acl(handle_t *handle,
+int ocfs2_set_acl(handle_t *handle,
struct inode *inode,
struct buffer_head *di_bh,
int type,
@@ -192,14 +241,19 @@ static int ocfs2_set_acl(handle_t *handle,
case ACL_TYPE_ACCESS:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
ret = posix_acl_equiv_mode(acl, &mode);
if (ret < 0)
return ret;
else {
- inode->i_mode = mode;
if (ret == 0)
acl = NULL;
+
+ ret = ocfs2_acl_set_mode(inode, di_bh,
+ handle, mode);
+ if (ret)
+ return ret;
+
}
}
break;
@@ -230,250 +284,29 @@ static int ocfs2_set_acl(handle_t *handle,
return ret;
}
-int ocfs2_check_acl(struct inode *inode, int mask)
-{
- struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
-
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- int ret = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- return ret;
- }
-
- return -EAGAIN;
-}
-
-int ocfs2_acl_chmod(struct inode *inode)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct posix_acl *acl, *clone;
- int ret;
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return 0;
-
- acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
- clone = posix_acl_clone(acl, GFP_KERNEL);
- posix_acl_release(acl);
- if (!clone)
- return -ENOMEM;
- ret = posix_acl_chmod_masq(clone, inode->i_mode);
- if (!ret)
- ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
- clone, NULL, NULL);
- posix_acl_release(clone);
- return ret;
-}
-
-/*
- * Initialize the ACLs of a new inode. If parent directory has default ACL,
- * then clone to new inode. Called from ocfs2_mknod.
- */
-int ocfs2_init_acl(handle_t *handle,
- struct inode *inode,
- struct inode *dir,
- struct buffer_head *di_bh,
- struct buffer_head *dir_bh,
- struct ocfs2_alloc_context *meta_ac,
- struct ocfs2_alloc_context *data_ac)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct posix_acl *acl = NULL;
- int ret = 0;
-
- if (!S_ISLNK(inode->i_mode)) {
- if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
- acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
- dir_bh);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- if (!acl)
- inode->i_mode &= ~current_umask();
- }
- if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
- struct posix_acl *clone;
- mode_t mode;
-
- if (S_ISDIR(inode->i_mode)) {
- ret = ocfs2_set_acl(handle, inode, di_bh,
- ACL_TYPE_DEFAULT, acl,
- meta_ac, data_ac);
- if (ret)
- goto cleanup;
- }
- clone = posix_acl_clone(acl, GFP_NOFS);
- ret = -ENOMEM;
- if (!clone)
- goto cleanup;
-
- mode = inode->i_mode;
- ret = posix_acl_create_masq(clone, &mode);
- if (ret >= 0) {
- inode->i_mode = mode;
- if (ret > 0) {
- ret = ocfs2_set_acl(handle, inode,
- di_bh, ACL_TYPE_ACCESS,
- clone, meta_ac, data_ac);
- }
- }
- posix_acl_release(clone);
- }
-cleanup:
- posix_acl_release(acl);
- return ret;
-}
-
-static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
- char *list,
- size_t list_len,
- const char *name,
- size_t name_len)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return 0;
-
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
- return size;
-}
-
-static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
- char *list,
- size_t list_len,
- const char *name,
- size_t name_len)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return 0;
-
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
- return size;
-}
-
-static int ocfs2_xattr_get_acl(struct inode *inode,
- int type,
- void *buffer,
- size_t size)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct posix_acl *acl;
- int ret;
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return -EOPNOTSUPP;
-
- acl = ocfs2_get_acl(inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- ret = posix_acl_to_xattr(acl, buffer, size);
- posix_acl_release(acl);
-
- return ret;
-}
-
-static int ocfs2_xattr_get_acl_access(struct inode *inode,
- const char *name,
- void *buffer,
- size_t size)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-
-static int ocfs2_xattr_get_acl_default(struct inode *inode,
- const char *name,
- void *buffer,
- size_t size)
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
+ return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
}
-static int ocfs2_xattr_set_acl(struct inode *inode,
- int type,
- const void *value,
- size_t size)
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_super *osb;
+ struct buffer_head *di_bh = NULL;
struct posix_acl *acl;
- int ret = 0;
+ int ret = -EAGAIN;
+ osb = OCFS2_SB(inode->i_sb);
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return -EOPNOTSUPP;
-
- if (!is_owner_or_cap(inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- ret = posix_acl_valid(acl);
- if (ret)
- goto cleanup;
- }
- } else
- acl = NULL;
+ return NULL;
- ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret < 0)
+ return ERR_PTR(ret);
-cleanup:
- posix_acl_release(acl);
- return ret;
-}
+ acl = ocfs2_get_acl_nolock(inode, type, di_bh);
-static int ocfs2_xattr_set_acl_access(struct inode *inode,
- const char *name,
- const void *value,
- size_t size,
- int flags)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
+ brelse(di_bh);
-static int ocfs2_xattr_set_acl_default(struct inode *inode,
- const char *name,
- const void *value,
- size_t size,
- int flags)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+ return acl;
}
-
-struct xattr_handler ocfs2_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .list = ocfs2_xattr_list_acl_access,
- .get = ocfs2_xattr_get_acl_access,
- .set = ocfs2_xattr_set_acl_access,
-};
-
-struct xattr_handler ocfs2_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .list = ocfs2_xattr_list_acl_default,
- .get = ocfs2_xattr_get_acl_default,
- .set = ocfs2_xattr_set_acl_default,
-};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 8f6389ed4da..3fce68d0862 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,33 +26,14 @@ struct ocfs2_acl_entry {
__le32 e_id;
};
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
-
-extern int ocfs2_check_acl(struct inode *, int);
-extern int ocfs2_acl_chmod(struct inode *);
-extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
- struct buffer_head *, struct buffer_head *,
- struct ocfs2_alloc_context *,
- struct ocfs2_alloc_context *);
-
-#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
-
-#define ocfs2_check_acl NULL
-static inline int ocfs2_acl_chmod(struct inode *inode)
-{
- return 0;
-}
-static inline int ocfs2_init_acl(handle_t *handle,
- struct inode *inode,
- struct inode *dir,
- struct buffer_head *di_bh,
- struct buffer_head *dir_bh,
- struct ocfs2_alloc_context *meta_ac,
- struct ocfs2_alloc_context *data_ac)
-{
- return 0;
-}
-
-#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ocfs2_set_acl(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *di_bh,
+ int type,
+ struct posix_acl *acl,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac);
#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 678a067d925..9d8fcf2f3b9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,8 +29,8 @@
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -49,10 +49,22 @@
#include "super.h"
#include "uptodate.h"
#include "xattr.h"
+#include "refcounttree.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
+enum ocfs2_contig_type {
+ CONTIG_NONE = 0,
+ CONTIG_LEFT,
+ CONTIG_RIGHT,
+ CONTIG_LEFTRIGHT,
+};
+static enum ocfs2_contig_type
+ ocfs2_extent_rec_contig(struct super_block *sb,
+ struct ocfs2_extent_rec *ext,
+ struct ocfs2_extent_rec *insert_rec);
/*
* Operations for a specific extent tree type.
*
@@ -79,18 +91,30 @@ struct ocfs2_extent_tree_operations {
* that value. new_clusters is the delta, and must be
* added to the total. Required.
*/
- void (*eo_update_clusters)(struct inode *inode,
- struct ocfs2_extent_tree *et,
+ void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
u32 new_clusters);
/*
+ * If this extent tree is supported by an extent map, insert
+ * a record into the map.
+ */
+ void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec);
+
+ /*
+ * If this extent tree is supported by an extent map, truncate the
+ * map to clusters.
+ */
+ void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
+ u32 clusters);
+
+ /*
* If ->eo_insert_check() exists, it is called before rec is
* inserted into the extent tree. It is optional.
*/
- int (*eo_insert_check)(struct inode *inode,
- struct ocfs2_extent_tree *et,
+ int (*eo_insert_check)(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec);
- int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
+ int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
/*
* --------------------------------------------------------------
@@ -109,8 +133,17 @@ struct ocfs2_extent_tree_operations {
* it exists. If it does not, et->et_max_leaf_clusters is set
* to 0 (unlimited). Optional.
*/
- void (*eo_fill_max_leaf_clusters)(struct inode *inode,
- struct ocfs2_extent_tree *et);
+ void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
+
+ /*
+ * ->eo_extent_contig tests whether the two ocfs2_extent_rec
+ * are contiguous or not. Optional. It need not be set if the tree
+ * uses ocfs2_extent_rec as its leaf record.
+ */
+ enum ocfs2_contig_type
+ (*eo_extent_contig)(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *ext,
+ struct ocfs2_extent_rec *insert_rec);
};
@@ -121,19 +154,22 @@ struct ocfs2_extent_tree_operations {
static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
u64 blkno);
-static void ocfs2_dinode_update_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
u32 clusters);
-static int ocfs2_dinode_insert_check(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec);
+static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
+ u32 clusters);
+static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec);
-static int ocfs2_dinode_sanity_check(struct inode *inode,
- struct ocfs2_extent_tree *et);
+static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
.eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
.eo_update_clusters = ocfs2_dinode_update_clusters,
+ .eo_extent_map_insert = ocfs2_dinode_extent_map_insert,
+ .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate,
.eo_insert_check = ocfs2_dinode_insert_check,
.eo_sanity_check = ocfs2_dinode_sanity_check,
.eo_fill_root_el = ocfs2_dinode_fill_root_el,
@@ -156,40 +192,53 @@ static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
return le64_to_cpu(di->i_last_eb_blk);
}
-static void ocfs2_dinode_update_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
u32 clusters)
{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
struct ocfs2_dinode *di = et->et_object;
le32_add_cpu(&di->i_clusters, clusters);
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
+ spin_lock(&oi->ip_lock);
+ oi->ip_clusters = le32_to_cpu(di->i_clusters);
+ spin_unlock(&oi->ip_lock);
}
-static int ocfs2_dinode_insert_check(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec)
+{
+ struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
+
+ ocfs2_extent_map_insert_rec(inode, rec);
+}
+
+static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
+
+ ocfs2_extent_map_trunc(inode, clusters);
+}
+
+static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec)
{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
+ struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
- BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
+ BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
- (OCFS2_I(inode)->ip_clusters !=
- le32_to_cpu(rec->e_cpos)),
+ (oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
"Device %s, asking for sparse allocation: inode %llu, "
"cpos %u, clusters %u\n",
osb->dev_str,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- rec->e_cpos,
- OCFS2_I(inode)->ip_clusters);
+ (unsigned long long)oi->ip_blkno,
+ rec->e_cpos, oi->ip_clusters);
return 0;
}
-static int ocfs2_dinode_sanity_check(struct inode *inode,
- struct ocfs2_extent_tree *et)
+static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
{
struct ocfs2_dinode *di = et->et_object;
@@ -229,8 +278,7 @@ static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
}
-static void ocfs2_xattr_value_update_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
u32 clusters)
{
struct ocfs2_xattr_value_buf *vb = et->et_object;
@@ -252,12 +300,11 @@ static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
}
-static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et)
+static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
{
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
et->et_max_leaf_clusters =
- ocfs2_clusters_for_bytes(inode->i_sb,
- OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
+ ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
}
static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
@@ -277,8 +324,7 @@ static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
return le64_to_cpu(xt->xt_last_eb_blk);
}
-static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
u32 clusters)
{
struct ocfs2_xattr_block *xb = et->et_object;
@@ -309,8 +355,7 @@ static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
return le64_to_cpu(dx_root->dr_last_eb_blk);
}
-static void ocfs2_dx_root_update_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
u32 clusters)
{
struct ocfs2_dx_root_block *dx_root = et->et_object;
@@ -318,8 +363,7 @@ static void ocfs2_dx_root_update_clusters(struct inode *inode,
le32_add_cpu(&dx_root->dr_clusters, clusters);
}
-static int ocfs2_dx_root_sanity_check(struct inode *inode,
- struct ocfs2_extent_tree *et)
+static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
{
struct ocfs2_dx_root_block *dx_root = et->et_object;
@@ -343,8 +387,54 @@ static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
.eo_fill_root_el = ocfs2_dx_root_fill_root_el,
};
+static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_refcount_block *rb = et->et_object;
+
+ et->et_root_el = &rb->rf_list;
+}
+
+static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno)
+{
+ struct ocfs2_refcount_block *rb = et->et_object;
+
+ rb->rf_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_refcount_block *rb = et->et_object;
+
+ return le64_to_cpu(rb->rf_last_eb_blk);
+}
+
+static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct ocfs2_refcount_block *rb = et->et_object;
+
+ le32_add_cpu(&rb->rf_clusters, clusters);
+}
+
+static enum ocfs2_contig_type
+ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *ext,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ return CONTIG_NONE;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
+ .eo_update_clusters = ocfs2_refcount_tree_update_clusters,
+ .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el,
+ .eo_extent_contig = ocfs2_refcount_tree_extent_contig,
+};
+
static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh,
ocfs2_journal_access_func access,
void *obj,
@@ -352,6 +442,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
{
et->et_ops = ops;
et->et_root_bh = bh;
+ et->et_ci = ci;
et->et_root_journal_access = access;
if (!obj)
obj = (void *)bh->b_data;
@@ -361,41 +452,49 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
if (!et->et_ops->eo_fill_max_leaf_clusters)
et->et_max_leaf_clusters = 0;
else
- et->et_ops->eo_fill_max_leaf_clusters(inode, et);
+ et->et_ops->eo_fill_max_leaf_clusters(et);
}
void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
- __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
+ __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
NULL, &ocfs2_dinode_et_ops);
}
void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
- __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
+ __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
NULL, &ocfs2_xattr_tree_et_ops);
}
void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct ocfs2_xattr_value_buf *vb)
{
- __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
+ __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
&ocfs2_xattr_value_et_ops);
}
void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
- __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
+ __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
NULL, &ocfs2_dx_root_et_ops);
}
+void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh)
+{
+ __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
+ NULL, &ocfs2_refcount_tree_et_ops);
+}
+
static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
u64 new_last_eb_blk)
{
@@ -407,80 +506,78 @@ static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
return et->et_ops->eo_get_last_eb_blk(et);
}
-static inline void ocfs2_et_update_clusters(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
u32 clusters)
{
- et->et_ops->eo_update_clusters(inode, et, clusters);
+ et->et_ops->eo_update_clusters(et, clusters);
+}
+
+static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec)
+{
+ if (et->et_ops->eo_extent_map_insert)
+ et->et_ops->eo_extent_map_insert(et, rec);
+}
+
+static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ if (et->et_ops->eo_extent_map_truncate)
+ et->et_ops->eo_extent_map_truncate(et, clusters);
}
static inline int ocfs2_et_root_journal_access(handle_t *handle,
- struct inode *inode,
struct ocfs2_extent_tree *et,
int type)
{
- return et->et_root_journal_access(handle, inode, et->et_root_bh,
+ return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
type);
}
-static inline int ocfs2_et_insert_check(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static inline enum ocfs2_contig_type
+ ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ if (et->et_ops->eo_extent_contig)
+ return et->et_ops->eo_extent_contig(et, rec, insert_rec);
+
+ return ocfs2_extent_rec_contig(
+ ocfs2_metadata_cache_get_super(et->et_ci),
+ rec, insert_rec);
+}
+
+static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec)
{
int ret = 0;
if (et->et_ops->eo_insert_check)
- ret = et->et_ops->eo_insert_check(inode, et, rec);
+ ret = et->et_ops->eo_insert_check(et, rec);
return ret;
}
-static inline int ocfs2_et_sanity_check(struct inode *inode,
- struct ocfs2_extent_tree *et)
+static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
{
int ret = 0;
if (et->et_ops->eo_sanity_check)
- ret = et->et_ops->eo_sanity_check(inode, et);
+ ret = et->et_ops->eo_sanity_check(et);
return ret;
}
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
struct ocfs2_extent_block *eb);
-
-/*
- * Structures which describe a path through a btree, and functions to
- * manipulate them.
- *
- * The idea here is to be as generic as possible with the tree
- * manipulation code.
- */
-struct ocfs2_path_item {
- struct buffer_head *bh;
- struct ocfs2_extent_list *el;
-};
-
-#define OCFS2_MAX_PATH_DEPTH 5
-
-struct ocfs2_path {
- int p_tree_depth;
- ocfs2_journal_access_func p_root_access;
- struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
-};
-
-#define path_root_bh(_path) ((_path)->p_node[0].bh)
-#define path_root_el(_path) ((_path)->p_node[0].el)
-#define path_root_access(_path)((_path)->p_root_access)
-#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
-#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
-#define path_num_items(_path) ((_path)->p_tree_depth + 1)
-
+static void ocfs2_adjust_rightmost_records(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_extent_rec *insert_rec);
/*
* Reset the actual path elements so that we can re-use the structure
* to build another path. Generally, this involves freeing the buffer
* heads.
*/
-static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
+void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
{
int i, start = 0, depth = 0;
struct ocfs2_path_item *node;
@@ -509,7 +606,7 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
path->p_tree_depth = depth;
}
-static void ocfs2_free_path(struct ocfs2_path *path)
+void ocfs2_free_path(struct ocfs2_path *path)
{
if (path) {
ocfs2_reinit_path(path, 0);
@@ -607,13 +704,13 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
return path;
}
-static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
+struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
{
return ocfs2_new_path(path_root_bh(path), path_root_el(path),
path_root_access(path));
}
-static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
+struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
{
return ocfs2_new_path(et->et_root_bh, et->et_root_el,
et->et_root_journal_access);
@@ -626,10 +723,10 @@ static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
* I don't like the way this function's name looks next to
* ocfs2_journal_access_path(), but I don't have a better one.
*/
-static int ocfs2_path_bh_journal_access(handle_t *handle,
- struct inode *inode,
- struct ocfs2_path *path,
- int idx)
+int ocfs2_path_bh_journal_access(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct ocfs2_path *path,
+ int idx)
{
ocfs2_journal_access_func access = path_root_access(path);
@@ -639,15 +736,16 @@ static int ocfs2_path_bh_journal_access(handle_t *handle,
if (idx)
access = ocfs2_journal_access_eb;
- return access(handle, inode, path->p_node[idx].bh,
+ return access(handle, ci, path->p_node[idx].bh,
OCFS2_JOURNAL_ACCESS_WRITE);
}
/*
* Convenience function to journal all components in a path.
*/
-static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
- struct ocfs2_path *path)
+int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
+ handle_t *handle,
+ struct ocfs2_path *path)
{
int i, ret = 0;
@@ -655,7 +753,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
goto out;
for(i = 0; i < path_num_items(path); i++) {
- ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
+ ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -696,17 +794,9 @@ int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
return ret;
}
-enum ocfs2_contig_type {
- CONTIG_NONE = 0,
- CONTIG_LEFT,
- CONTIG_RIGHT,
- CONTIG_LEFTRIGHT,
-};
-
-
/*
* NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
- * ocfs2_extent_contig only work properly against leaf nodes!
+ * ocfs2_extent_rec_contig only work properly against leaf nodes!
*/
static int ocfs2_block_extent_contig(struct super_block *sb,
struct ocfs2_extent_rec *ext,
@@ -732,9 +822,9 @@ static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
}
static enum ocfs2_contig_type
- ocfs2_extent_contig(struct inode *inode,
- struct ocfs2_extent_rec *ext,
- struct ocfs2_extent_rec *insert_rec)
+ ocfs2_extent_rec_contig(struct super_block *sb,
+ struct ocfs2_extent_rec *ext,
+ struct ocfs2_extent_rec *insert_rec)
{
u64 blkno = le64_to_cpu(insert_rec->e_blkno);
@@ -747,12 +837,12 @@ static enum ocfs2_contig_type
return CONTIG_NONE;
if (ocfs2_extents_adjacent(ext, insert_rec) &&
- ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
+ ocfs2_block_extent_contig(sb, ext, blkno))
return CONTIG_RIGHT;
blkno = le64_to_cpu(ext->e_blkno);
if (ocfs2_extents_adjacent(insert_rec, ext) &&
- ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
+ ocfs2_block_extent_contig(sb, insert_rec, blkno))
return CONTIG_LEFT;
return CONTIG_NONE;
@@ -797,8 +887,7 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
struct ocfs2_extent_block *eb =
(struct ocfs2_extent_block *)bh->b_data;
- mlog(0, "Validating extent block %llu\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
@@ -847,13 +936,13 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
return 0;
}
-int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
struct buffer_head **bh)
{
int rc;
struct buffer_head *tmp = *bh;
- rc = ocfs2_read_block(inode, eb_blkno, &tmp,
+ rc = ocfs2_read_block(ci, eb_blkno, &tmp,
ocfs2_validate_extent_block);
/* If ocfs2_read_block() got us a new bh, pass it up. */
@@ -868,7 +957,6 @@ int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
* How many free extents have we got before we need more meta data?
*/
int ocfs2_num_free_extents(struct ocfs2_super *osb,
- struct inode *inode,
struct ocfs2_extent_tree *et)
{
int retval;
@@ -877,13 +965,12 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct buffer_head *eb_bh = NULL;
u64 last_eb_blk = 0;
- mlog_entry_void();
-
el = et->et_root_el;
last_eb_blk = ocfs2_et_get_last_eb_blk(et);
if (last_eb_blk) {
- retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
+ retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
+ &eb_bh);
if (retval < 0) {
mlog_errno(retval);
goto bail;
@@ -898,7 +985,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
bail:
brelse(eb_bh);
- mlog_exit(retval);
+ trace_ocfs2_num_free_extents(retval);
return retval;
}
@@ -907,9 +994,8 @@ bail:
* sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
* l_count for you
*/
-static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
+static int ocfs2_create_new_meta_bhs(handle_t *handle,
+ struct ocfs2_extent_tree *et,
int wanted,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head *bhs[])
@@ -917,17 +1003,17 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
int count, status, i;
u16 suballoc_bit_start;
u32 num_got;
- u64 first_blkno;
+ u64 suballoc_loc, first_blkno;
+ struct ocfs2_super *osb =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
struct ocfs2_extent_block *eb;
- mlog_entry_void();
-
count = 0;
while (count < wanted) {
- status = ocfs2_claim_metadata(osb,
- handle,
+ status = ocfs2_claim_metadata(handle,
meta_ac,
wanted - count,
+ &suballoc_loc,
&suballoc_bit_start,
&num_got,
&first_blkno);
@@ -939,13 +1025,14 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
for(i = count; i < (num_got + count); i++) {
bhs[i] = sb_getblk(osb->sb, first_blkno);
if (bhs[i] == NULL) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
- ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+ ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
- status = ocfs2_journal_access_eb(handle, inode, bhs[i],
+ status = ocfs2_journal_access_eb(handle, et->et_ci,
+ bhs[i],
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -958,7 +1045,9 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
eb->h_blkno = cpu_to_le64(first_blkno);
eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
- eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
+ eb->h_suballoc_slot =
+ cpu_to_le16(meta_ac->ac_alloc_slot);
+ eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
eb->h_list.l_count =
cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -968,11 +1057,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
/* We'll also be dirtied by the caller, so
* this isn't absolutely necessary. */
- status = ocfs2_journal_dirty(handle, bhs[i]);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, bhs[i]);
}
count += num_got;
@@ -985,8 +1070,8 @@ bail:
brelse(bhs[i]);
bhs[i] = NULL;
}
+ mlog_errno(status);
}
- mlog_exit(status);
return status;
}
@@ -1013,8 +1098,54 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
}
/*
+ * Change range of the branches in the right most path according to the leaf
+ * extent block's rightmost record.
+ */
+static int ocfs2_adjust_rightmost_branch(handle_t *handle,
+ struct ocfs2_extent_tree *et)
+{
+ int status;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+
+ path = ocfs2_new_path_from_et(et);
+ if (!path) {
+ status = -ENOMEM;
+ return status;
+ }
+
+ status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ status = ocfs2_extend_trans(handle, path_num_items(path));
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ status = ocfs2_journal_access_path(et->et_ci, handle, path);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+ rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1];
+
+ ocfs2_adjust_rightmost_records(handle, et, path, rec);
+
+out:
+ ocfs2_free_path(path);
+ return status;
+}
+
+/*
* Add an entire tree branch to our inode. eb_bh is the extent block
- * to start at, if we don't want to start the branch at the dinode
+ * to start at, if we don't want to start the branch at the root
* structure.
*
* last_eb_bh is required as we have to update it's next_leaf pointer
@@ -1023,9 +1154,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
* the new branch will be 'empty' in the sense that every block will
* contain a single record with cluster count == 0.
*/
-static int ocfs2_add_branch(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
+static int ocfs2_add_branch(handle_t *handle,
struct ocfs2_extent_tree *et,
struct buffer_head *eb_bh,
struct buffer_head **last_eb_bh,
@@ -1038,9 +1167,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *eb_el;
struct ocfs2_extent_list *el;
- u32 new_cpos;
-
- mlog_entry_void();
+ u32 new_cpos, root_end;
BUG_ON(!last_eb_bh || !*last_eb_bh);
@@ -1055,6 +1182,30 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
new_blocks = le16_to_cpu(el->l_tree_depth);
+ eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
+ new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
+ root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
+
+ /*
+ * If there is a gap between the root end and the real end
+ * of the rightmost leaf block, we need to remove the gap
+ * between new_cpos and root_end first so that the tree
+ * is consistent after we add a new branch (it will start
+ * from new_cpos).
+ */
+ if (root_end > new_cpos) {
+ trace_ocfs2_adjust_rightmost_branch(
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci),
+ root_end, new_cpos);
+
+ status = ocfs2_adjust_rightmost_branch(handle, et);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
/* allocate the number of new eb blocks we need */
new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
GFP_KERNEL);
@@ -1064,16 +1215,13 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
+ status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
meta_ac, new_eb_bhs);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
- new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
-
/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
* linked with the rest of the tree.
* conversly, new_eb_bhs[0] is the new bottommost leaf.
@@ -1089,7 +1237,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
eb_el = &eb->h_list;
- status = ocfs2_journal_access_eb(handle, inode, bh,
+ status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -1114,12 +1262,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
if (!eb_el->l_tree_depth)
new_last_eb_blk = le64_to_cpu(eb->h_blkno);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
+ ocfs2_journal_dirty(handle, bh);
next_blkno = le64_to_cpu(eb->h_blkno);
}
@@ -1129,20 +1272,20 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
* journal_dirty erroring as it won't unless we've aborted the
* handle (in which case we would never be here) so reserving
* the write with journal_access is all we need to do. */
- status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
+ status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_et_root_journal_access(handle, inode, et,
+ status = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
if (eb_bh) {
- status = ocfs2_journal_access_eb(handle, inode, eb_bh,
+ status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1165,17 +1308,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
- status = ocfs2_journal_dirty(handle, *last_eb_bh);
- if (status < 0)
- mlog_errno(status);
- status = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (status < 0)
- mlog_errno(status);
- if (eb_bh) {
- status = ocfs2_journal_dirty(handle, eb_bh);
- if (status < 0)
- mlog_errno(status);
- }
+ ocfs2_journal_dirty(handle, *last_eb_bh);
+ ocfs2_journal_dirty(handle, et->et_root_bh);
+ if (eb_bh)
+ ocfs2_journal_dirty(handle, eb_bh);
/*
* Some callers want to track the rightmost leaf so pass it
@@ -1193,7 +1329,6 @@ bail:
kfree(new_eb_bhs);
}
- mlog_exit(status);
return status;
}
@@ -1202,9 +1337,7 @@ bail:
* returns back the new extent block so you can add a branch to it
* after this call.
*/
-static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
+static int ocfs2_shift_tree_depth(handle_t *handle,
struct ocfs2_extent_tree *et,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **ret_new_eb_bh)
@@ -1216,9 +1349,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
struct ocfs2_extent_list *root_el;
struct ocfs2_extent_list *eb_el;
- mlog_entry_void();
-
- status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
+ status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
&new_eb_bh);
if (status < 0) {
mlog_errno(status);
@@ -1232,7 +1363,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
eb_el = &eb->h_list;
root_el = et->et_root_el;
- status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
+ status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -1245,13 +1376,9 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
eb_el->l_recs[i] = root_el->l_recs[i];
- status = ocfs2_journal_dirty(handle, new_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, new_eb_bh);
- status = ocfs2_et_root_journal_access(handle, inode, et,
+ status = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1274,11 +1401,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
if (root_el->l_tree_depth == cpu_to_le16(1))
ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
- status = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, et->et_root_bh);
*ret_new_eb_bh = new_eb_bh;
new_eb_bh = NULL;
@@ -1286,7 +1409,6 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
bail:
brelse(new_eb_bh);
- mlog_exit(status);
return status;
}
@@ -1307,9 +1429,7 @@ bail:
*
* return status < 0 indicates an error.
*/
-static int ocfs2_find_branch_target(struct ocfs2_super *osb,
- struct inode *inode,
- struct ocfs2_extent_tree *et,
+static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
struct buffer_head **target_bh)
{
int status = 0, i;
@@ -1319,27 +1439,27 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
struct buffer_head *bh = NULL;
struct buffer_head *lowest_bh = NULL;
- mlog_entry_void();
-
*target_bh = NULL;
el = et->et_root_el;
while(le16_to_cpu(el->l_tree_depth) > 1) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
- ocfs2_error(inode->i_sb, "Dinode %llu has empty "
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has empty "
"extent list (next_free_rec == 0)",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
status = -EIO;
goto bail;
}
i = le16_to_cpu(el->l_next_free_rec) - 1;
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (!blkno) {
- ocfs2_error(inode->i_sb, "Dinode %llu has extent "
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has extent "
"list where extent # %d has no physical "
"block start",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
status = -EIO;
goto bail;
}
@@ -1347,7 +1467,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
brelse(bh);
bh = NULL;
- status = ocfs2_read_extent_block(inode, blkno, &bh);
+ status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1374,7 +1494,6 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
bail:
brelse(bh);
- mlog_exit(status);
return status;
}
@@ -1388,20 +1507,18 @@ bail:
*
* *last_eb_bh will be updated by ocfs2_add_branch().
*/
-static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
- struct ocfs2_extent_tree *et, int *final_depth,
- struct buffer_head **last_eb_bh,
+static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
+ int *final_depth, struct buffer_head **last_eb_bh,
struct ocfs2_alloc_context *meta_ac)
{
int ret, shift;
struct ocfs2_extent_list *el = et->et_root_el;
int depth = le16_to_cpu(el->l_tree_depth);
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *bh = NULL;
BUG_ON(meta_ac == NULL);
- shift = ocfs2_find_branch_target(osb, inode, et, &bh);
+ shift = ocfs2_find_branch_target(et, &bh);
if (shift < 0) {
ret = shift;
mlog_errno(ret);
@@ -1413,13 +1530,15 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
* another tree level */
if (shift) {
BUG_ON(bh);
- mlog(0, "need to shift tree depth (current = %d)\n", depth);
+ trace_ocfs2_grow_tree(
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci),
+ depth);
/* ocfs2_shift_tree_depth will return us a buffer with
* the new extent block (so we can pass that to
* ocfs2_add_branch). */
- ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
- meta_ac, &bh);
+ ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1444,8 +1563,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
/* call ocfs2_add_branch to add the final part of the tree with
* the new data. */
- mlog(0, "add branch. bh = %p\n", bh);
- ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
+ ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
meta_ac);
if (ret < 0) {
mlog_errno(ret);
@@ -1519,8 +1637,9 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
}
insert_index = i;
- mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
- insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
+ trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
+ has_empty, next_free,
+ le16_to_cpu(el->l_count));
BUG_ON(insert_index < 0);
BUG_ON(insert_index >= le16_to_cpu(el->l_count));
@@ -1615,9 +1734,9 @@ set_and_inc:
*
* The array index of the subtree root is passed back.
*/
-static int ocfs2_find_subtree_root(struct inode *inode,
- struct ocfs2_path *left,
- struct ocfs2_path *right)
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left,
+ struct ocfs2_path *right)
{
int i = 0;
@@ -1633,10 +1752,10 @@ static int ocfs2_find_subtree_root(struct inode *inode,
* The caller didn't pass two adjacent paths.
*/
mlog_bug_on_msg(i > left->p_tree_depth,
- "Inode %lu, left depth %u, right depth %u\n"
+ "Owner %llu, left depth %u, right depth %u\n"
"left leaf blk %llu, right leaf blk %llu\n",
- inode->i_ino, left->p_tree_depth,
- right->p_tree_depth,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ left->p_tree_depth, right->p_tree_depth,
(unsigned long long)path_leaf_bh(left)->b_blocknr,
(unsigned long long)path_leaf_bh(right)->b_blocknr);
} while (left->p_node[i].bh->b_blocknr ==
@@ -1653,7 +1772,7 @@ typedef void (path_insert_t)(void *, struct buffer_head *);
* This code can be called with a cpos larger than the tree, in which
* case it will return the rightmost path.
*/
-static int __ocfs2_find_path(struct inode *inode,
+static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
struct ocfs2_extent_list *root_el, u32 cpos,
path_insert_t *func, void *data)
{
@@ -1664,15 +1783,14 @@ static int __ocfs2_find_path(struct inode *inode,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
el = root_el;
while (el->l_tree_depth) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has empty extent list at "
+ ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+ "Owner %llu has empty extent list at "
"depth %u\n",
- (unsigned long long)oi->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth));
ret = -EROFS;
goto out;
@@ -1695,10 +1813,10 @@ static int __ocfs2_find_path(struct inode *inode,
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (blkno == 0) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has bad blkno in extent list "
+ ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+ "Owner %llu has bad blkno in extent list "
"at depth %u (index %d)\n",
- (unsigned long long)oi->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth), i);
ret = -EROFS;
goto out;
@@ -1706,7 +1824,7 @@ static int __ocfs2_find_path(struct inode *inode,
brelse(bh);
bh = NULL;
- ret = ocfs2_read_extent_block(inode, blkno, &bh);
+ ret = ocfs2_read_extent_block(ci, blkno, &bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1717,10 +1835,10 @@ static int __ocfs2_find_path(struct inode *inode,
if (le16_to_cpu(el->l_next_free_rec) >
le16_to_cpu(el->l_count)) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has bad count in extent list "
+ ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+ "Owner %llu has bad count in extent list "
"at block %llu (next free=%u, count=%u)\n",
- (unsigned long long)oi->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)bh->b_blocknr,
le16_to_cpu(el->l_next_free_rec),
le16_to_cpu(el->l_count));
@@ -1764,14 +1882,14 @@ static void find_path_ins(void *data, struct buffer_head *bh)
ocfs2_path_insert_eb(fp->path, fp->index, bh);
fp->index++;
}
-static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
- u32 cpos)
+int ocfs2_find_path(struct ocfs2_caching_info *ci,
+ struct ocfs2_path *path, u32 cpos)
{
struct find_path_data data;
data.index = 1;
data.path = path;
- return __ocfs2_find_path(inode, path_root_el(path), cpos,
+ return __ocfs2_find_path(ci, path_root_el(path), cpos,
find_path_ins, &data);
}
@@ -1796,13 +1914,14 @@ static void find_leaf_ins(void *data, struct buffer_head *bh)
*
* This function doesn't handle non btree extent lists.
*/
-int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
- u32 cpos, struct buffer_head **leaf_bh)
+int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
+ struct ocfs2_extent_list *root_el, u32 cpos,
+ struct buffer_head **leaf_bh)
{
int ret;
struct buffer_head *bh = NULL;
- ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
+ ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1842,7 +1961,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
* immediately to their right.
*/
left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
- if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
+ if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
+ BUG_ON(right_child_el->l_tree_depth);
BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
}
@@ -1907,12 +2027,12 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
* - When we've adjusted the last extent record in the left path leaf and the
* 1st extent record in the right path leaf during cross extent block merge.
*/
-static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
+static void ocfs2_complete_edge_insert(handle_t *handle,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
int subtree_index)
{
- int ret, i, idx;
+ int i, idx;
struct ocfs2_extent_list *el, *left_el, *right_el;
struct ocfs2_extent_rec *left_rec, *right_rec;
struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
@@ -1932,7 +2052,7 @@ static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
left_el = path_leaf_el(left_path);
right_el = path_leaf_el(right_path);
for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
- mlog(0, "Adjust records at index %u\n", i);
+ trace_ocfs2_complete_edge_insert(i);
/*
* One nice property of knowing that all of these
@@ -1950,13 +2070,8 @@ static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
right_el);
- ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
- if (ret)
- mlog_errno(ret);
-
- ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
+ ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
/*
* Setup our list pointers now so that the current
@@ -1980,13 +2095,11 @@ static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
root_bh = left_path->p_node[subtree_index].bh;
- ret = ocfs2_journal_dirty(handle, root_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, root_bh);
}
-static int ocfs2_rotate_subtree_right(struct inode *inode,
- handle_t *handle,
+static int ocfs2_rotate_subtree_right(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
int subtree_index)
@@ -2002,10 +2115,10 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
left_el = path_leaf_el(left_path);
if (left_el->l_next_free_rec != left_el->l_count) {
- ocfs2_error(inode->i_sb,
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
"Inode %llu has non-full interior leaf node %llu"
"(next free = %u)",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)left_leaf_bh->b_blocknr,
le16_to_cpu(left_el->l_next_free_rec));
return -EROFS;
@@ -2021,7 +2134,7 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
root_bh = left_path->p_node[subtree_index].bh;
BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
- ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
subtree_index);
if (ret) {
mlog_errno(ret);
@@ -2029,14 +2142,14 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
}
for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
left_path, i);
if (ret) {
mlog_errno(ret);
@@ -2050,16 +2163,12 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
/* This is a code error, not a disk corruption. */
mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
"because rightmost leaf block %llu is empty\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)right_leaf_bh->b_blocknr);
ocfs2_create_empty_extent(right_el);
- ret = ocfs2_journal_dirty(handle, right_leaf_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ ocfs2_journal_dirty(handle, right_leaf_bh);
/* Do the copy now. */
i = le16_to_cpu(left_el->l_next_free_rec) - 1;
@@ -2078,14 +2187,10 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
le16_add_cpu(&left_el->l_next_free_rec, 1);
- ret = ocfs2_journal_dirty(handle, left_leaf_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ ocfs2_journal_dirty(handle, left_leaf_bh);
- ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
- subtree_index);
+ ocfs2_complete_edge_insert(handle, left_path, right_path,
+ subtree_index);
out:
return ret;
@@ -2097,8 +2202,8 @@ out:
*
* Will return zero if the path passed in is already the leftmost path.
*/
-static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
- struct ocfs2_path *path, u32 *cpos)
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos)
{
int i, j, ret = 0;
u64 blkno;
@@ -2175,12 +2280,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
int op_credits,
struct ocfs2_path *path)
{
+ int ret = 0;
int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
if (handle->h_buffer_credits < credits)
- return ocfs2_extend_trans(handle, credits);
+ ret = ocfs2_extend_trans(handle,
+ credits - handle->h_buffer_credits);
- return 0;
+ return ret;
}
/*
@@ -2239,7 +2346,7 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
*
* The array is assumed to be large enough to hold an entire path (tree depth).
*
- * Upon succesful return from this function:
+ * Upon successful return from this function:
*
* - The 'right_path' array will contain a path to the leaf block
* whose range contains e_cpos.
@@ -2248,8 +2355,8 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
* *ret_left_path will contain a valid path which can be passed to
* ocfs2_insert_path().
*/
-static int ocfs2_rotate_tree_right(struct inode *inode,
- handle_t *handle,
+static int ocfs2_rotate_tree_right(handle_t *handle,
+ struct ocfs2_extent_tree *et,
enum ocfs2_split_type split,
u32 insert_cpos,
struct ocfs2_path *right_path,
@@ -2258,6 +2365,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
int ret, start, orig_credits = handle->h_buffer_credits;
u32 cpos;
struct ocfs2_path *left_path = NULL;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
*ret_left_path = NULL;
@@ -2268,13 +2376,15 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
goto out;
}
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
+ ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
- mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
+ trace_ocfs2_rotate_tree_right(
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ insert_cpos, cpos);
/*
* What we want to do here is:
@@ -2303,10 +2413,12 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
* rotating subtrees.
*/
while (cpos && insert_cpos <= cpos) {
- mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
- insert_cpos, cpos);
+ trace_ocfs2_rotate_tree_right(
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci),
+ insert_cpos, cpos);
- ret = ocfs2_find_path(inode, left_path, cpos);
+ ret = ocfs2_find_path(et->et_ci, left_path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2314,10 +2426,11 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
mlog_bug_on_msg(path_leaf_bh(left_path) ==
path_leaf_bh(right_path),
- "Inode %lu: error during insert of %u "
+ "Owner %llu: error during insert of %u "
"(left path cpos %u) results in two identical "
"paths ending at %llu\n",
- inode->i_ino, insert_cpos, cpos,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ insert_cpos, cpos,
(unsigned long long)
path_leaf_bh(left_path)->b_blocknr);
@@ -2343,12 +2456,12 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
goto out_ret_path;
}
- start = ocfs2_find_subtree_root(inode, left_path, right_path);
+ start = ocfs2_find_subtree_root(et, left_path, right_path);
- mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
- start,
- (unsigned long long) right_path->p_node[start].bh->b_blocknr,
- right_path->p_tree_depth);
+ trace_ocfs2_rotate_subtree(start,
+ (unsigned long long)
+ right_path->p_node[start].bh->b_blocknr,
+ right_path->p_tree_depth);
ret = ocfs2_extend_rotate_transaction(handle, start,
orig_credits, right_path);
@@ -2357,7 +2470,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
goto out;
}
- ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
+ ret = ocfs2_rotate_subtree_right(handle, et, left_path,
right_path, start);
if (ret) {
mlog_errno(ret);
@@ -2389,8 +2502,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
*/
ocfs2_mv_path(right_path, left_path);
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
- &cpos);
+ ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2404,15 +2516,37 @@ out_ret_path:
return ret;
}
-static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
- struct ocfs2_path *path)
+static int ocfs2_update_edge_lengths(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ int subtree_index, struct ocfs2_path *path)
{
- int i, idx;
+ int i, idx, ret;
struct ocfs2_extent_rec *rec;
struct ocfs2_extent_list *el;
struct ocfs2_extent_block *eb;
u32 range;
+ /*
+ * In normal tree rotation process, we will never touch the
+ * tree branch above subtree_index and ocfs2_extend_rotate_transaction
+ * doesn't reserve the credits for them either.
+ *
+ * But we do have a special case here which will update the rightmost
+ * records for all the bh in the path.
+ * So we have to allocate extra credits and access them.
+ */
+ ret = ocfs2_extend_trans(handle, subtree_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(et->et_ci, handle, path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
/* Path should always be rightmost. */
eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
BUG_ON(eb->h_next_leaf_blk != 0ULL);
@@ -2433,9 +2567,12 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
ocfs2_journal_dirty(handle, path->p_node[i].bh);
}
+out:
+ return ret;
}
-static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
+static void ocfs2_unlink_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_cached_dealloc_ctxt *dealloc,
struct ocfs2_path *path, int unlink_start)
{
@@ -2457,12 +2594,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
mlog(ML_ERROR,
"Inode %llu, attempted to remove extent block "
"%llu with %u records\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(el->l_next_free_rec));
ocfs2_journal_dirty(handle, bh);
- ocfs2_remove_from_cache(inode, bh);
+ ocfs2_remove_from_cache(et->et_ci, bh);
continue;
}
@@ -2475,11 +2612,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
if (ret)
mlog_errno(ret);
- ocfs2_remove_from_cache(inode, bh);
+ ocfs2_remove_from_cache(et->et_ci, bh);
}
}
-static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
+static void ocfs2_unlink_subtree(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
int subtree_index,
@@ -2510,17 +2648,17 @@ static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
ocfs2_journal_dirty(handle, root_bh);
ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
- ocfs2_unlink_path(inode, handle, dealloc, right_path,
+ ocfs2_unlink_path(handle, et, dealloc, right_path,
subtree_index + 1);
}
-static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
+static int ocfs2_rotate_subtree_left(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
int subtree_index,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- int *deleted,
- struct ocfs2_extent_tree *et)
+ int *deleted)
{
int ret, i, del_right_subtree = 0, right_has_empty = 0;
struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
@@ -2556,7 +2694,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
return -EAGAIN;
if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
- ret = ocfs2_journal_access_eb(handle, inode,
+ ret = ocfs2_journal_access_eb(handle, et->et_ci,
path_leaf_bh(right_path),
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
@@ -2575,7 +2713,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
* We have to update i_last_eb_blk during the meta
* data delete.
*/
- ret = ocfs2_et_root_journal_access(handle, inode, et,
+ ret = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -2591,7 +2729,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
*/
BUG_ON(right_has_empty && !del_right_subtree);
- ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
subtree_index);
if (ret) {
mlog_errno(ret);
@@ -2599,14 +2737,14 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
}
for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
left_path, i);
if (ret) {
mlog_errno(ret);
@@ -2635,17 +2773,18 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
ocfs2_remove_empty_extent(right_leaf_el);
}
- ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
- if (ret)
- mlog_errno(ret);
- ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+ ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
if (del_right_subtree) {
- ocfs2_unlink_subtree(inode, handle, left_path, right_path,
+ ocfs2_unlink_subtree(handle, et, left_path, right_path,
subtree_index, dealloc);
- ocfs2_update_edge_lengths(inode, handle, left_path);
+ ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
+ left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
@@ -2658,13 +2797,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
if (right_has_empty)
ocfs2_remove_empty_extent(left_leaf_el);
- ret = ocfs2_journal_dirty(handle, et_root_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, et_root_bh);
*deleted = 1;
} else
- ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
+ ocfs2_complete_edge_insert(handle, left_path, right_path,
subtree_index);
out:
@@ -2680,8 +2817,8 @@ out:
* This looks similar, but is subtly different to
* ocfs2_find_cpos_for_left_leaf().
*/
-static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
- struct ocfs2_path *path, u32 *cpos)
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos)
{
int i, j, ret = 0;
u64 blkno;
@@ -2750,8 +2887,8 @@ out:
return ret;
}
-static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
- handle_t *handle,
+static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path)
{
int ret;
@@ -2761,7 +2898,7 @@ static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
if (!ocfs2_is_empty_extent(&el->l_recs[0]))
return 0;
- ret = ocfs2_path_bh_journal_access(handle, inode, path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
path_num_items(path) - 1);
if (ret) {
mlog_errno(ret);
@@ -2769,33 +2906,30 @@ static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
}
ocfs2_remove_empty_extent(el);
-
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, bh);
out:
return ret;
}
-static int __ocfs2_rotate_tree_left(struct inode *inode,
- handle_t *handle, int orig_credits,
+static int __ocfs2_rotate_tree_left(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ int orig_credits,
struct ocfs2_path *path,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_path **empty_extent_path,
- struct ocfs2_extent_tree *et)
+ struct ocfs2_path **empty_extent_path)
{
int ret, subtree_root, deleted;
u32 right_cpos;
struct ocfs2_path *left_path = NULL;
struct ocfs2_path *right_path = NULL;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
*empty_extent_path = NULL;
- ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
- &right_cpos);
+ ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2818,17 +2952,16 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
}
while (right_cpos) {
- ret = ocfs2_find_path(inode, right_path, right_cpos);
+ ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
- subtree_root = ocfs2_find_subtree_root(inode, left_path,
+ subtree_root = ocfs2_find_subtree_root(et, left_path,
right_path);
- mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
- subtree_root,
+ trace_ocfs2_rotate_subtree(subtree_root,
(unsigned long long)
right_path->p_node[subtree_root].bh->b_blocknr,
right_path->p_tree_depth);
@@ -2844,16 +2977,16 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
* Caller might still want to make changes to the
* tree root, so re-add it to the journal here.
*/
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
left_path, 0);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
+ ret = ocfs2_rotate_subtree_left(handle, et, left_path,
right_path, subtree_root,
- dealloc, &deleted, et);
+ dealloc, &deleted);
if (ret == -EAGAIN) {
/*
* The rotation has to temporarily stop due to
@@ -2880,7 +3013,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
ocfs2_mv_path(left_path, right_path);
- ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
+ ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
&right_cpos);
if (ret) {
mlog_errno(ret);
@@ -2895,10 +3028,10 @@ out:
return ret;
}
-static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
+static int ocfs2_remove_rightmost_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
- struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_extent_tree *et)
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret, subtree_index;
u32 cpos;
@@ -2907,7 +3040,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
struct ocfs2_extent_list *el;
- ret = ocfs2_et_sanity_check(inode, et);
+ ret = ocfs2_et_sanity_check(et);
if (ret)
goto out;
/*
@@ -2922,13 +3055,14 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
goto out;
}
- ret = ocfs2_journal_access_path(inode, handle, path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
+ ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+ path, &cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2946,23 +3080,28 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
goto out;
}
- ret = ocfs2_find_path(inode, left_path, cpos);
+ ret = ocfs2_find_path(et->et_ci, left_path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access_path(inode, handle, left_path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
if (ret) {
mlog_errno(ret);
goto out;
}
- subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
+ subtree_index = ocfs2_find_subtree_root(et, left_path, path);
- ocfs2_unlink_subtree(inode, handle, left_path, path,
+ ocfs2_unlink_subtree(handle, et, left_path, path,
subtree_index, dealloc);
- ocfs2_update_edge_lengths(inode, handle, left_path);
+ ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
+ left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
@@ -2971,10 +3110,10 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
* 'path' is also the leftmost path which
* means it must be the only one. This gets
* handled differently because we want to
- * revert the inode back to having extents
+ * revert the root back to having extents
* in-line.
*/
- ocfs2_unlink_path(inode, handle, dealloc, path, 1);
+ ocfs2_unlink_path(handle, et, dealloc, path, 1);
el = et->et_root_el;
el->l_tree_depth = 0;
@@ -3007,10 +3146,10 @@ out:
* the rightmost tree leaf record is removed so the caller is
* responsible for detecting and correcting that.
*/
-static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
+static int ocfs2_rotate_tree_left(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
- struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_extent_tree *et)
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret, orig_credits = handle->h_buffer_credits;
struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -3027,8 +3166,7 @@ rightmost_no_delete:
* Inline extents. This is trivially handled, so do
* it up front.
*/
- ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
- path);
+ ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
if (ret)
mlog_errno(ret);
goto out;
@@ -3044,7 +3182,7 @@ rightmost_no_delete:
*
* 1) is handled via ocfs2_rotate_rightmost_leaf_left()
* 2a) we need the left branch so that we can update it with the unlink
- * 2b) we need to bring the inode back to inline extents.
+ * 2b) we need to bring the root back to inline extents.
*/
eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
@@ -3060,9 +3198,9 @@ rightmost_no_delete:
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ret = -EIO;
- ocfs2_error(inode->i_sb,
- "Inode %llu has empty extent block at %llu",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has empty extent block at %llu",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)le64_to_cpu(eb->h_blkno));
goto out;
}
@@ -3076,8 +3214,8 @@ rightmost_no_delete:
* nonempty list.
*/
- ret = ocfs2_remove_rightmost_path(inode, handle, path,
- dealloc, et);
+ ret = ocfs2_remove_rightmost_path(handle, et, path,
+ dealloc);
if (ret)
mlog_errno(ret);
goto out;
@@ -3088,8 +3226,8 @@ rightmost_no_delete:
* and restarting from there.
*/
try_rotate:
- ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
- dealloc, &restart_path, et);
+ ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
+ dealloc, &restart_path);
if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out;
@@ -3099,9 +3237,9 @@ try_rotate:
tmp_path = restart_path;
restart_path = NULL;
- ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
+ ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
tmp_path, dealloc,
- &restart_path, et);
+ &restart_path);
if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out;
@@ -3152,7 +3290,7 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
}
}
-static int ocfs2_get_right_path(struct inode *inode,
+static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
struct ocfs2_path *left_path,
struct ocfs2_path **ret_right_path)
{
@@ -3169,8 +3307,8 @@ static int ocfs2_get_right_path(struct inode *inode,
left_el = path_leaf_el(left_path);
BUG_ON(left_el->l_next_free_rec != left_el->l_count);
- ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
- &right_cpos);
+ ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+ left_path, &right_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3186,7 +3324,7 @@ static int ocfs2_get_right_path(struct inode *inode,
goto out;
}
- ret = ocfs2_find_path(inode, right_path, right_cpos);
+ ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3206,9 +3344,9 @@ out:
* For index == l_count - 1, the "next" means the 1st extent rec of the
* next extent block.
*/
-static int ocfs2_merge_rec_right(struct inode *inode,
- struct ocfs2_path *left_path,
+static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *split_rec,
int index)
{
@@ -3229,7 +3367,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
/* we meet with a cross extent block merge. */
- ret = ocfs2_get_right_path(inode, left_path, &right_path);
+ ret = ocfs2_get_right_path(et, left_path, &right_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3248,8 +3386,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
le16_to_cpu(left_rec->e_leaf_clusters) !=
le32_to_cpu(right_rec->e_cpos));
- subtree_index = ocfs2_find_subtree_root(inode,
- left_path, right_path);
+ subtree_index = ocfs2_find_subtree_root(et, left_path,
+ right_path);
ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
handle->h_buffer_credits,
@@ -3262,7 +3400,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
root_bh = left_path->p_node[subtree_index].bh;
BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
- ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
subtree_index);
if (ret) {
mlog_errno(ret);
@@ -3271,14 +3409,14 @@ static int ocfs2_merge_rec_right(struct inode *inode,
for (i = subtree_index + 1;
i < path_num_items(right_path); i++) {
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
left_path, i);
if (ret) {
mlog_errno(ret);
@@ -3291,7 +3429,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
right_rec = &el->l_recs[index + 1];
}
- ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
path_num_items(left_path) - 1);
if (ret) {
mlog_errno(ret);
@@ -3302,22 +3440,17 @@ static int ocfs2_merge_rec_right(struct inode *inode,
le32_add_cpu(&right_rec->e_cpos, -split_clusters);
le64_add_cpu(&right_rec->e_blkno,
- -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
+ -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
+ split_clusters));
le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
ocfs2_cleanup_merge(el, index);
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret)
- mlog_errno(ret);
-
+ ocfs2_journal_dirty(handle, bh);
if (right_path) {
- ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
- if (ret)
- mlog_errno(ret);
-
- ocfs2_complete_edge_insert(inode, handle, left_path,
- right_path, subtree_index);
+ ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
+ ocfs2_complete_edge_insert(handle, left_path, right_path,
+ subtree_index);
}
out:
if (right_path)
@@ -3325,7 +3458,7 @@ out:
return ret;
}
-static int ocfs2_get_left_path(struct inode *inode,
+static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
struct ocfs2_path *right_path,
struct ocfs2_path **ret_left_path)
{
@@ -3338,7 +3471,7 @@ static int ocfs2_get_left_path(struct inode *inode,
/* This function shouldn't be called for non-trees. */
BUG_ON(right_path->p_tree_depth == 0);
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+ ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
right_path, &left_cpos);
if (ret) {
mlog_errno(ret);
@@ -3355,7 +3488,7 @@ static int ocfs2_get_left_path(struct inode *inode,
goto out;
}
- ret = ocfs2_find_path(inode, left_path, left_cpos);
+ ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3378,12 +3511,11 @@ out:
* remove the rightmost leaf extent block in the right_path and change
* the right path to indicate the new rightmost path.
*/
-static int ocfs2_merge_rec_left(struct inode *inode,
- struct ocfs2_path *right_path,
+static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *split_rec,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_extent_tree *et,
int index)
{
int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3401,7 +3533,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
right_rec = &el->l_recs[index];
if (index == 0) {
/* we meet with a cross extent block merge. */
- ret = ocfs2_get_left_path(inode, right_path, &left_path);
+ ret = ocfs2_get_left_path(et, right_path, &left_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3417,8 +3549,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
le16_to_cpu(left_rec->e_leaf_clusters) !=
le32_to_cpu(split_rec->e_cpos));
- subtree_index = ocfs2_find_subtree_root(inode,
- left_path, right_path);
+ subtree_index = ocfs2_find_subtree_root(et, left_path,
+ right_path);
ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
handle->h_buffer_credits,
@@ -3431,7 +3563,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
root_bh = left_path->p_node[subtree_index].bh;
BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
- ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
subtree_index);
if (ret) {
mlog_errno(ret);
@@ -3440,14 +3572,14 @@ static int ocfs2_merge_rec_left(struct inode *inode,
for (i = subtree_index + 1;
i < path_num_items(right_path); i++) {
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_path_bh_journal_access(handle, inode,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
left_path, i);
if (ret) {
mlog_errno(ret);
@@ -3460,7 +3592,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
has_empty_extent = 1;
}
- ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
path_num_items(right_path) - 1);
if (ret) {
mlog_errno(ret);
@@ -3479,19 +3611,15 @@ static int ocfs2_merge_rec_left(struct inode *inode,
le32_add_cpu(&right_rec->e_cpos, split_clusters);
le64_add_cpu(&right_rec->e_blkno,
- ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
+ ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
+ split_clusters));
le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
ocfs2_cleanup_merge(el, index);
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret)
- mlog_errno(ret);
-
+ ocfs2_journal_dirty(handle, bh);
if (left_path) {
- ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
/*
* In the situation that the right_rec is empty and the extent
@@ -3501,9 +3629,9 @@ static int ocfs2_merge_rec_left(struct inode *inode,
if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
le16_to_cpu(el->l_next_free_rec) == 1) {
- ret = ocfs2_remove_rightmost_path(inode, handle,
+ ret = ocfs2_remove_rightmost_path(handle, et,
right_path,
- dealloc, et);
+ dealloc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3515,7 +3643,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
ocfs2_mv_path(right_path, left_path);
left_path = NULL;
} else
- ocfs2_complete_edge_insert(inode, handle, left_path,
+ ocfs2_complete_edge_insert(handle, left_path,
right_path, subtree_index);
}
out:
@@ -3524,15 +3652,13 @@ out:
return ret;
}
-static int ocfs2_try_to_merge_extent(struct inode *inode,
- handle_t *handle,
+static int ocfs2_try_to_merge_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
int split_index,
struct ocfs2_extent_rec *split_rec,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_merge_ctxt *ctxt,
- struct ocfs2_extent_tree *et)
-
+ struct ocfs2_merge_ctxt *ctxt)
{
int ret = 0;
struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -3548,8 +3674,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* extents - having more than one in a leaf is
* illegal.
*/
- ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc, et);
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3578,8 +3703,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* prevoius extent block. It is more efficient and easier
* if we do merge_right first and merge_left later.
*/
- ret = ocfs2_merge_rec_right(inode, path,
- handle, split_rec,
+ ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
split_index);
if (ret) {
mlog_errno(ret);
@@ -3592,8 +3716,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
/* The merge left us with an empty extent, remove it. */
- ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc, et);
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3605,18 +3728,15 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* Note that we don't pass split_rec here on purpose -
* we've merged it into the rec already.
*/
- ret = ocfs2_merge_rec_left(inode, path,
- handle, rec,
- dealloc, et,
- split_index);
+ ret = ocfs2_merge_rec_left(path, handle, et, rec,
+ dealloc, split_index);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc, et);
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
/*
* Error from this last rotate is not critical, so
* print but don't bubble it up.
@@ -3633,19 +3753,16 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* the record on the left (hence the left merge).
*/
if (ctxt->c_contig_type == CONTIG_RIGHT) {
- ret = ocfs2_merge_rec_left(inode,
- path,
- handle, split_rec,
- dealloc, et,
+ ret = ocfs2_merge_rec_left(path, handle, et,
+ split_rec, dealloc,
split_index);
if (ret) {
mlog_errno(ret);
goto out;
}
} else {
- ret = ocfs2_merge_rec_right(inode,
- path,
- handle, split_rec,
+ ret = ocfs2_merge_rec_right(path, handle,
+ et, split_rec,
split_index);
if (ret) {
mlog_errno(ret);
@@ -3658,8 +3775,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* The merge may have left an empty extent in
* our leaf. Try to rotate it away.
*/
- ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc, et);
+ ret = ocfs2_rotate_tree_left(handle, et, path,
+ dealloc);
if (ret)
mlog_errno(ret);
ret = 0;
@@ -3705,10 +3822,10 @@ static void ocfs2_subtract_from_rec(struct super_block *sb,
* list. If this leaf is part of an allocation tree, it is assumed
* that the tree above has been prepared.
*/
-static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
+static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *insert_rec,
struct ocfs2_extent_list *el,
- struct ocfs2_insert_type *insert,
- struct inode *inode)
+ struct ocfs2_insert_type *insert)
{
int i = insert->ins_contig_index;
unsigned int range;
@@ -3720,7 +3837,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
BUG_ON(i == -1);
rec = &el->l_recs[i];
- ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
+ ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ insert->ins_split, rec,
insert_rec);
goto rotate;
}
@@ -3762,10 +3880,10 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
le16_to_cpu(el->l_count),
- "inode %lu, depth %u, count %u, next free %u, "
+ "owner %llu, depth %u, count %u, next free %u, "
"rec.cpos %u, rec.clusters %u, "
"insert.cpos %u, insert.clusters %u\n",
- inode->i_ino,
+ ocfs2_metadata_cache_owner(et->et_ci),
le16_to_cpu(el->l_tree_depth),
le16_to_cpu(el->l_count),
le16_to_cpu(el->l_next_free_rec),
@@ -3793,8 +3911,8 @@ rotate:
ocfs2_rotate_leaf(el, insert_rec);
}
-static void ocfs2_adjust_rightmost_records(struct inode *inode,
- handle_t *handle,
+static void ocfs2_adjust_rightmost_records(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
struct ocfs2_extent_rec *insert_rec)
{
@@ -3812,9 +3930,9 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0) {
- ocfs2_error(inode->i_sb,
- "Dinode %llu has a bad extent list",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has a bad extent list",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
ret = -EIO;
return;
}
@@ -3827,14 +3945,12 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
le32_add_cpu(&rec->e_int_clusters,
-le32_to_cpu(rec->e_cpos));
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret)
- mlog_errno(ret);
-
+ ocfs2_journal_dirty(handle, bh);
}
}
-static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
+static int ocfs2_append_rec_to_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *insert_rec,
struct ocfs2_path *right_path,
struct ocfs2_path **ret_left_path)
@@ -3862,16 +3978,18 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
(next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
u32 left_cpos;
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
- &left_cpos);
+ ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+ right_path, &left_cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
- mlog(0, "Append may need a left path update. cpos: %u, "
- "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
- left_cpos);
+ trace_ocfs2_append_rec_to_path(
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci),
+ le32_to_cpu(insert_rec->e_cpos),
+ left_cpos);
/*
* No need to worry if the append is already in the
@@ -3885,7 +4003,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
goto out;
}
- ret = ocfs2_find_path(inode, left_path, left_cpos);
+ ret = ocfs2_find_path(et->et_ci, left_path,
+ left_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3898,13 +4017,13 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
}
}
- ret = ocfs2_journal_access_path(inode, handle, right_path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
if (ret) {
mlog_errno(ret);
goto out;
}
- ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
+ ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
*ret_left_path = left_path;
ret = 0;
@@ -3915,7 +4034,7 @@ out:
return ret;
}
-static void ocfs2_split_record(struct inode *inode,
+static void ocfs2_split_record(struct ocfs2_extent_tree *et,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
struct ocfs2_extent_rec *split_rec,
@@ -3988,7 +4107,8 @@ static void ocfs2_split_record(struct inode *inode,
}
rec = &el->l_recs[index];
- ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
+ ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ split, rec, split_rec);
ocfs2_rotate_leaf(insert_el, split_rec);
}
@@ -4000,8 +4120,8 @@ static void ocfs2_split_record(struct inode *inode,
* in. left_path should only be passed in if we need to update that
* portion of the tree after an edge insert.
*/
-static int ocfs2_insert_path(struct inode *inode,
- handle_t *handle,
+static int ocfs2_insert_path(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *left_path,
struct ocfs2_path *right_path,
struct ocfs2_extent_rec *insert_rec,
@@ -4011,23 +4131,19 @@ static int ocfs2_insert_path(struct inode *inode,
struct buffer_head *leaf_bh = path_leaf_bh(right_path);
if (left_path) {
- int credits = handle->h_buffer_credits;
-
/*
* There's a chance that left_path got passed back to
* us without being accounted for in the
* journal. Extend our transaction here to be sure we
* can change those blocks.
*/
- credits += left_path->p_tree_depth;
-
- ret = ocfs2_extend_trans(handle, credits);
+ ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access_path(inode, handle, left_path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -4038,7 +4154,7 @@ static int ocfs2_insert_path(struct inode *inode,
* Pass both paths to the journal. The majority of inserts
* will be touching all components anyway.
*/
- ret = ocfs2_journal_access_path(inode, handle, right_path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -4050,7 +4166,7 @@ static int ocfs2_insert_path(struct inode *inode,
* of splits, but it's easier to just let one separate
* function sort it all out.
*/
- ocfs2_split_record(inode, left_path, right_path,
+ ocfs2_split_record(et, left_path, right_path,
insert_rec, insert->ins_split);
/*
@@ -4059,17 +4175,13 @@ static int ocfs2_insert_path(struct inode *inode,
* dirty this for us.
*/
if (left_path)
- ret = ocfs2_journal_dirty(handle,
- path_leaf_bh(left_path));
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle,
+ path_leaf_bh(left_path));
} else
- ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
- insert, inode);
+ ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
+ insert);
- ret = ocfs2_journal_dirty(handle, leaf_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, leaf_bh);
if (left_path) {
/*
@@ -4078,10 +4190,10 @@ static int ocfs2_insert_path(struct inode *inode,
*
* XXX: Should we extend the transaction here?
*/
- subtree_index = ocfs2_find_subtree_root(inode, left_path,
+ subtree_index = ocfs2_find_subtree_root(et, left_path,
right_path);
- ocfs2_complete_edge_insert(inode, handle, left_path,
- right_path, subtree_index);
+ ocfs2_complete_edge_insert(handle, left_path, right_path,
+ subtree_index);
}
ret = 0;
@@ -4089,8 +4201,7 @@ out:
return ret;
}
-static int ocfs2_do_insert_extent(struct inode *inode,
- handle_t *handle,
+static int ocfs2_do_insert_extent(handle_t *handle,
struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *insert_rec,
struct ocfs2_insert_type *type)
@@ -4103,7 +4214,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
el = et->et_root_el;
- ret = ocfs2_et_root_journal_access(handle, inode, et,
+ ret = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -4111,7 +4222,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
}
if (le16_to_cpu(el->l_tree_depth) == 0) {
- ocfs2_insert_at_leaf(insert_rec, el, type, inode);
+ ocfs2_insert_at_leaf(et, insert_rec, el, type);
goto out_update_clusters;
}
@@ -4134,7 +4245,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
cpos = UINT_MAX;
}
- ret = ocfs2_find_path(inode, right_path, cpos);
+ ret = ocfs2_find_path(et->et_ci, right_path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4153,7 +4264,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
* can wind up skipping both of these two special cases...
*/
if (rotate) {
- ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
+ ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
le32_to_cpu(insert_rec->e_cpos),
right_path, &left_path);
if (ret) {
@@ -4165,7 +4276,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
* ocfs2_rotate_tree_right() might have extended the
* transaction without re-journaling our tree root.
*/
- ret = ocfs2_et_root_journal_access(handle, inode, et,
+ ret = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -4173,7 +4284,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
}
} else if (type->ins_appending == APPEND_TAIL
&& type->ins_contig != CONTIG_LEFT) {
- ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
+ ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
right_path, &left_path);
if (ret) {
mlog_errno(ret);
@@ -4181,7 +4292,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
}
}
- ret = ocfs2_insert_path(inode, handle, left_path, right_path,
+ ret = ocfs2_insert_path(handle, et, left_path, right_path,
insert_rec, type);
if (ret) {
mlog_errno(ret);
@@ -4190,12 +4301,10 @@ static int ocfs2_do_insert_extent(struct inode *inode,
out_update_clusters:
if (type->ins_split == SPLIT_NONE)
- ocfs2_et_update_clusters(inode, et,
+ ocfs2_et_update_clusters(et,
le16_to_cpu(insert_rec->e_leaf_clusters));
- ret = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, et->et_root_bh);
out:
ocfs2_free_path(left_path);
@@ -4205,7 +4314,8 @@ out:
}
static enum ocfs2_contig_type
-ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
+ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
struct ocfs2_extent_list *el, int index,
struct ocfs2_extent_rec *split_rec)
{
@@ -4217,12 +4327,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
struct ocfs2_path *left_path = NULL, *right_path = NULL;
struct buffer_head *bh;
struct ocfs2_extent_block *eb;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
if (index > 0) {
rec = &el->l_recs[index - 1];
} else if (path->p_tree_depth > 0) {
- status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
- path, &left_cpos);
+ status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
if (status)
goto out;
@@ -4231,7 +4341,8 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (!left_path)
goto out;
- status = ocfs2_find_path(inode, left_path, left_cpos);
+ status = ocfs2_find_path(et->et_ci, left_path,
+ left_cpos);
if (status)
goto out;
@@ -4241,7 +4352,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
le16_to_cpu(new_el->l_count)) {
bh = path_leaf_bh(left_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- ocfs2_error(inode->i_sb,
+ ocfs2_error(sb,
"Extent block #%llu has an "
"invalid l_next_free_rec of "
"%d. It should have "
@@ -4266,7 +4377,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (split_rec->e_cpos == el->l_recs[index].e_cpos)
ret = CONTIG_RIGHT;
} else {
- ret = ocfs2_extent_contig(inode, rec, split_rec);
+ ret = ocfs2_et_extent_contig(et, rec, split_rec);
}
}
@@ -4275,8 +4386,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
rec = &el->l_recs[index + 1];
else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
path->p_tree_depth > 0) {
- status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
- path, &right_cpos);
+ status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
if (status)
goto out;
@@ -4287,7 +4397,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (!right_path)
goto out;
- status = ocfs2_find_path(inode, right_path, right_cpos);
+ status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
if (status)
goto out;
@@ -4297,7 +4407,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
bh = path_leaf_bh(right_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- ocfs2_error(inode->i_sb,
+ ocfs2_error(sb,
"Extent block #%llu has an "
"invalid l_next_free_rec of %d",
(unsigned long long)le64_to_cpu(eb->h_blkno),
@@ -4312,7 +4422,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (rec) {
enum ocfs2_contig_type contig_type;
- contig_type = ocfs2_extent_contig(inode, rec, split_rec);
+ contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
ret = CONTIG_LEFTRIGHT;
@@ -4329,11 +4439,10 @@ out:
return ret;
}
-static void ocfs2_figure_contig_type(struct inode *inode,
+static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
struct ocfs2_insert_type *insert,
struct ocfs2_extent_list *el,
- struct ocfs2_extent_rec *insert_rec,
- struct ocfs2_extent_tree *et)
+ struct ocfs2_extent_rec *insert_rec)
{
int i;
enum ocfs2_contig_type contig_type = CONTIG_NONE;
@@ -4341,8 +4450,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
- contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
- insert_rec);
+ contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
+ insert_rec);
if (contig_type != CONTIG_NONE) {
insert->ins_contig_index = i;
break;
@@ -4411,7 +4520,7 @@ set_tail_append:
}
/*
- * Helper function called at the begining of an insert.
+ * Helper function called at the beginning of an insert.
*
* This computes a few things that are commonly used in the process of
* inserting into the btree:
@@ -4423,8 +4532,7 @@ set_tail_append:
* All of the information is stored on the ocfs2_insert_type
* structure.
*/
-static int ocfs2_figure_insert_type(struct inode *inode,
- struct ocfs2_extent_tree *et,
+static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
struct buffer_head **last_eb_bh,
struct ocfs2_extent_rec *insert_rec,
int *free_records,
@@ -4448,11 +4556,11 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* ocfs2_figure_insert_type() and ocfs2_add_branch()
* may want it later.
*/
- ret = ocfs2_read_extent_block(inode,
+ ret = ocfs2_read_extent_block(et->et_ci,
ocfs2_et_get_last_eb_blk(et),
&bh);
if (ret) {
- mlog_exit(ret);
+ mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) bh->b_data;
@@ -4471,7 +4579,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
le16_to_cpu(el->l_next_free_rec);
if (!insert->ins_tree_depth) {
- ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
+ ocfs2_figure_contig_type(et, insert, el, insert_rec);
ocfs2_figure_appending_type(insert, el, insert_rec);
return 0;
}
@@ -4489,7 +4597,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* us the rightmost tree path. This is accounted for below in
* the appending code.
*/
- ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
+ ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
if (ret) {
mlog_errno(ret);
goto out;
@@ -4505,7 +4613,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* into two types of appends: simple record append, or a
* rotate inside the tail leaf.
*/
- ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
+ ocfs2_figure_contig_type(et, insert, el, insert_rec);
/*
* The insert code isn't quite ready to deal with all cases of
@@ -4550,13 +4658,11 @@ out:
}
/*
- * Insert an extent into an inode btree.
+ * Insert an extent into a btree.
*
- * The caller needs to update fe->i_clusters
+ * The caller needs to update the owning btree's cluster count.
*/
-int ocfs2_insert_extent(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
+int ocfs2_insert_extent(handle_t *handle,
struct ocfs2_extent_tree *et,
u32 cpos,
u64 start_blk,
@@ -4570,35 +4676,34 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
struct ocfs2_insert_type insert = {0, };
struct ocfs2_extent_rec rec;
- mlog(0, "add %u clusters at position %u to inode %llu\n",
- new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_insert_extent_start(
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos, new_clusters);
memset(&rec, 0, sizeof(rec));
rec.e_cpos = cpu_to_le32(cpos);
rec.e_blkno = cpu_to_le64(start_blk);
rec.e_leaf_clusters = cpu_to_le16(new_clusters);
rec.e_flags = flags;
- status = ocfs2_et_insert_check(inode, et, &rec);
+ status = ocfs2_et_insert_check(et, &rec);
if (status) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
+ status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
&free_records, &insert);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
- "Insert.contig_index: %d, Insert.free_records: %d, "
- "Insert.tree_depth: %d\n",
- insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
- free_records, insert.ins_tree_depth);
+ trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
+ insert.ins_contig_index, free_records,
+ insert.ins_tree_depth);
if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
- status = ocfs2_grow_tree(inode, handle, et,
+ status = ocfs2_grow_tree(handle, et,
&insert.ins_tree_depth, &last_eb_bh,
meta_ac);
if (status) {
@@ -4608,16 +4713,15 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
}
/* Finally, we can add clusters. This might rotate the tree for us. */
- status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
+ status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
if (status < 0)
mlog_errno(status);
- else if (et->et_ops == &ocfs2_dinode_et_ops)
- ocfs2_extent_map_insert_rec(inode, &rec);
+ else
+ ocfs2_et_extent_map_insert(et, &rec);
bail:
brelse(last_eb_bh);
- mlog_exit(status);
return status;
}
@@ -4628,30 +4732,31 @@ bail:
* it is not limited to the file storage. Any extent tree can use this
* function if it implements the proper ocfs2_extent_tree.
*/
-int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
- struct inode *inode,
+int ocfs2_add_clusters_in_btree(handle_t *handle,
+ struct ocfs2_extent_tree *et,
u32 *logical_offset,
u32 clusters_to_add,
int mark_unwritten,
- struct ocfs2_extent_tree *et,
- handle_t *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret)
{
- int status = 0;
+ int status = 0, err = 0;
+ int need_free = 0;
int free_extents;
enum ocfs2_alloc_restarted reason = RESTART_NONE;
u32 bit_off, num_bits;
u64 block;
u8 flags = 0;
+ struct ocfs2_super *osb =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
BUG_ON(!clusters_to_add);
if (mark_unwritten)
flags = OCFS2_EXT_UNWRITTEN;
- free_extents = ocfs2_num_free_extents(osb, inode, et);
+ free_extents = ocfs2_num_free_extents(osb, et);
if (free_extents < 0) {
status = free_extents;
mlog_errno(status);
@@ -4664,20 +4769,20 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
* 2) we are so fragmented, we've needed to add metadata too
* many times. */
if (!free_extents && !meta_ac) {
- mlog(0, "we haven't reserved any metadata!\n");
+ err = -1;
status = -EAGAIN;
reason = RESTART_META;
goto leave;
} else if ((!free_extents)
&& (ocfs2_alloc_context_bits_left(meta_ac)
< ocfs2_extend_meta_needed(et->et_root_el))) {
- mlog(0, "filesystem is really fragmented...\n");
+ err = -2;
status = -EAGAIN;
reason = RESTART_META;
goto leave;
}
- status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+ status = __ocfs2_claim_clusters(handle, data_ac, 1,
clusters_to_add, &bit_off, &num_bits);
if (status < 0) {
if (status != -ENOSPC)
@@ -4688,44 +4793,54 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
BUG_ON(num_bits > clusters_to_add);
/* reserve our write early -- insert_extent may update the tree root */
- status = ocfs2_et_root_journal_access(handle, inode, et,
+ status = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ need_free = 1;
+ goto bail;
}
block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
- mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
- num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_insert_extent(osb, handle, inode, et,
- *logical_offset, block,
+ trace_ocfs2_add_clusters_in_btree(
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ bit_off, num_bits);
+ status = ocfs2_insert_extent(handle, et, *logical_offset, block,
num_bits, flags, meta_ac);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ need_free = 1;
+ goto bail;
}
- status = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, et->et_root_bh);
clusters_to_add -= num_bits;
*logical_offset += num_bits;
if (clusters_to_add) {
- mlog(0, "need to alloc once more, wanted = %u\n",
- clusters_to_add);
+ err = clusters_to_add;
status = -EAGAIN;
reason = RESTART_TRANS;
}
+bail:
+ if (need_free) {
+ if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ bit_off, num_bits);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, bit_off),
+ num_bits);
+ }
+
leave:
- mlog_exit(status);
if (reason_ret)
*reason_ret = reason;
+ trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
return status;
}
@@ -4749,10 +4864,9 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
split_rec->e_flags = rec->e_flags;
}
-static int ocfs2_split_and_insert(struct inode *inode,
- handle_t *handle,
- struct ocfs2_path *path,
+static int ocfs2_split_and_insert(handle_t *handle,
struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
struct buffer_head **last_eb_bh,
int split_index,
struct ocfs2_extent_rec *orig_split_rec,
@@ -4785,7 +4899,7 @@ leftright:
if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
le16_to_cpu(rightmost_el->l_count)) {
- ret = ocfs2_grow_tree(inode, handle, et,
+ ret = ocfs2_grow_tree(handle, et,
&depth, last_eb_bh, meta_ac);
if (ret) {
mlog_errno(ret);
@@ -4814,8 +4928,8 @@ leftright:
*/
insert.ins_split = SPLIT_RIGHT;
- ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
- &rec);
+ ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ &tmprec, insert_range, &rec);
split_rec = tmprec;
@@ -4823,7 +4937,7 @@ leftright:
do_leftright = 1;
}
- ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
+ ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4839,7 +4953,7 @@ leftright:
ocfs2_reinit_path(path, 1);
cpos = le32_to_cpu(split_rec.e_cpos);
- ret = ocfs2_find_path(inode, path, cpos);
+ ret = ocfs2_find_path(et->et_ci, path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4854,8 +4968,8 @@ out:
return ret;
}
-static int ocfs2_replace_extent_rec(struct inode *inode,
- handle_t *handle,
+static int ocfs2_replace_extent_rec(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
struct ocfs2_extent_list *el,
int split_index,
@@ -4863,7 +4977,7 @@ static int ocfs2_replace_extent_rec(struct inode *inode,
{
int ret;
- ret = ocfs2_path_bh_journal_access(handle, inode, path,
+ ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
path_num_items(path) - 1);
if (ret) {
mlog_errno(ret);
@@ -4878,9 +4992,8 @@ out:
}
/*
- * Mark part or all of the extent record at split_index in the leaf
- * pointed to by path as written. This removes the unwritten
- * extent flag.
+ * Split part or all of the extent record at split_index in the leaf
+ * pointed to by path. Merge with the contiguous extent record if needed.
*
* Care is taken to handle contiguousness so as to not grow the tree.
*
@@ -4897,14 +5010,13 @@ out:
* have been brought into cache (and pinned via the journal), so the
* extra overhead is not expressed in terms of disk reads.
*/
-static int __ocfs2_mark_extent_written(struct inode *inode,
- struct ocfs2_extent_tree *et,
- handle_t *handle,
- struct ocfs2_path *path,
- int split_index,
- struct ocfs2_extent_rec *split_rec,
- struct ocfs2_alloc_context *meta_ac,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+int ocfs2_split_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ int split_index,
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret = 0;
struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -4913,12 +5025,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
struct ocfs2_merge_ctxt ctxt;
struct ocfs2_extent_list *rightmost_el;
- if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = -EIO;
- mlog_errno(ret);
- goto out;
- }
-
if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
(le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
@@ -4927,23 +5033,23 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
goto out;
}
- ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
+ ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
split_index,
split_rec);
/*
* The core merge / split code wants to know how much room is
- * left in this inodes allocation tree, so we pass the
+ * left in this allocation tree, so we pass the
* rightmost extent list.
*/
if (path->p_tree_depth) {
struct ocfs2_extent_block *eb;
- ret = ocfs2_read_extent_block(inode,
+ ret = ocfs2_read_extent_block(et->et_ci,
ocfs2_et_get_last_eb_blk(et),
&last_eb_bh);
if (ret) {
- mlog_exit(ret);
+ mlog_errno(ret);
goto out;
}
@@ -4960,25 +5066,24 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
- mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
- split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
- ctxt.c_split_covers_rec);
+ trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
+ ctxt.c_has_empty_extent,
+ ctxt.c_split_covers_rec);
if (ctxt.c_contig_type == CONTIG_NONE) {
if (ctxt.c_split_covers_rec)
- ret = ocfs2_replace_extent_rec(inode, handle,
- path, el,
+ ret = ocfs2_replace_extent_rec(handle, et, path, el,
split_index, split_rec);
else
- ret = ocfs2_split_and_insert(inode, handle, path, et,
+ ret = ocfs2_split_and_insert(handle, et, path,
&last_eb_bh, split_index,
split_rec, meta_ac);
if (ret)
mlog_errno(ret);
} else {
- ret = ocfs2_try_to_merge_extent(inode, handle, path,
+ ret = ocfs2_try_to_merge_extent(handle, et, path,
split_index, split_rec,
- dealloc, &ctxt, et);
+ dealloc, &ctxt);
if (ret)
mlog_errno(ret);
}
@@ -4989,46 +5094,31 @@ out:
}
/*
- * Mark the already-existing extent at cpos as written for len clusters.
+ * Change the flags of the already-existing extent at cpos for len clusters.
+ *
+ * new_flags: the flags we want to set.
+ * clear_flags: the flags we want to clear.
+ * phys: the new physical offset we want this new extent to start from.
*
* If the existing extent is larger than the request, initiate a
* split. An attempt will be made at merging with adjacent extents.
*
* The caller is responsible for passing down meta_ac if we'll need it.
*/
-int ocfs2_mark_extent_written(struct inode *inode,
- struct ocfs2_extent_tree *et,
- handle_t *handle, u32 cpos, u32 len, u32 phys,
- struct ocfs2_alloc_context *meta_ac,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+int ocfs2_change_extent_flag(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int new_flags, int clear_flags)
{
int ret, index;
- u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+ u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
struct ocfs2_extent_rec split_rec;
struct ocfs2_path *left_path = NULL;
struct ocfs2_extent_list *el;
-
- mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
- inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
-
- if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
- "that are being written to, but the feature bit "
- "is not set in the super block.",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- ret = -EROFS;
- goto out;
- }
-
- /*
- * XXX: This should be fixed up so that we just re-insert the
- * next extent records.
- *
- * XXX: This is a hack on the extent tree, maybe it should be
- * an op?
- */
- if (et->et_ops == &ocfs2_dinode_et_ops)
- ocfs2_extent_map_trunc(inode, 0);
+ struct ocfs2_extent_rec *rec;
left_path = ocfs2_new_path_from_et(et);
if (!left_path) {
@@ -5037,7 +5127,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
goto out;
}
- ret = ocfs2_find_path(inode, left_path, cpos);
+ ret = ocfs2_find_path(et->et_ci, left_path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5046,38 +5136,107 @@ int ocfs2_mark_extent_written(struct inode *inode,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has an extent at cpos %u which can no "
+ ocfs2_error(sb,
+ "Owner %llu has an extent at cpos %u which can no "
"longer be found.\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci), cpos);
ret = -EROFS;
goto out;
}
+ ret = -EIO;
+ rec = &el->l_recs[index];
+ if (new_flags && (rec->e_flags & new_flags)) {
+ mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
+ "extent that already had them",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ new_flags);
+ goto out;
+ }
+
+ if (clear_flags && !(rec->e_flags & clear_flags)) {
+ mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
+ "extent that didn't have them",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ clear_flags);
+ goto out;
+ }
+
memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
split_rec.e_cpos = cpu_to_le32(cpos);
split_rec.e_leaf_clusters = cpu_to_le16(len);
split_rec.e_blkno = cpu_to_le64(start_blkno);
- split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
- split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
-
- ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
- index, &split_rec, meta_ac,
- dealloc);
+ split_rec.e_flags = rec->e_flags;
+ if (new_flags)
+ split_rec.e_flags |= new_flags;
+ if (clear_flags)
+ split_rec.e_flags &= ~clear_flags;
+
+ ret = ocfs2_split_extent(handle, et, left_path,
+ index, &split_rec, meta_ac,
+ dealloc);
if (ret)
mlog_errno(ret);
out:
ocfs2_free_path(left_path);
return ret;
+
+}
+
+/*
+ * Mark the already-existing extent at cpos as written for len clusters
+ * by clearing OCFS2_EXT_UNWRITTEN through ocfs2_change_extent_flag().
+ * Returns -EROFS if the super block lacks the unwritten-extent feature.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+int ocfs2_mark_extent_written(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ handle_t *handle, u32 cpos, u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+
+ trace_ocfs2_mark_extent_written(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ cpos, len, phys);
+
+ if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
+ ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
+ "that are being written to, but the feature bit "
+ "is not set in the super block.",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ ret = -EROFS;
+ goto out;
+ }
+
+ /*
+ * XXX: This should be fixed up so that we just re-insert the
+ * next extent records.
+ */
+ ocfs2_et_extent_map_truncate(et, 0);
+
+ ret = ocfs2_change_extent_flag(handle, et, cpos,
+ len, phys, meta_ac, dealloc,
+ 0, OCFS2_EXT_UNWRITTEN);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
}
-static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
- handle_t *handle, struct ocfs2_path *path,
+static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
int index, u32 new_range,
struct ocfs2_alloc_context *meta_ac)
{
- int ret, depth, credits = handle->h_buffer_credits;
+ int ret, depth, credits;
struct buffer_head *last_eb_bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *rightmost_el, *el;
@@ -5090,11 +5249,12 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
*/
el = path_leaf_el(path);
rec = &el->l_recs[index];
- ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
+ ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+ &split_rec, new_range, rec);
depth = path->p_tree_depth;
if (depth > 0) {
- ret = ocfs2_read_extent_block(inode,
+ ret = ocfs2_read_extent_block(et->et_ci,
ocfs2_et_get_last_eb_blk(et),
&last_eb_bh);
if (ret < 0) {
@@ -5107,8 +5267,8 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
} else
rightmost_el = path_leaf_el(path);
- credits += path->p_tree_depth +
- ocfs2_extend_meta_needed(et->et_root_el);
+ credits = path->p_tree_depth +
+ ocfs2_extend_meta_needed(et->et_root_el);
ret = ocfs2_extend_trans(handle, credits);
if (ret) {
mlog_errno(ret);
@@ -5117,7 +5277,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
le16_to_cpu(rightmost_el->l_count)) {
- ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
+ ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
meta_ac);
if (ret) {
mlog_errno(ret);
@@ -5131,7 +5291,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
insert.ins_split = SPLIT_RIGHT;
insert.ins_tree_depth = depth;
- ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
+ ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
if (ret)
mlog_errno(ret);
@@ -5140,23 +5300,23 @@ out:
return ret;
}
-static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
+static int ocfs2_truncate_rec(handle_t *handle,
+ struct ocfs2_extent_tree *et,
struct ocfs2_path *path, int index,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- u32 cpos, u32 len,
- struct ocfs2_extent_tree *et)
+ u32 cpos, u32 len)
{
int ret;
u32 left_cpos, rec_range, trunc_range;
int wants_rotate = 0, is_rightmost_tree_rec = 0;
- struct super_block *sb = inode->i_sb;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
struct ocfs2_path *left_path = NULL;
struct ocfs2_extent_list *el = path_leaf_el(path);
struct ocfs2_extent_rec *rec;
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
- ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5188,14 +5348,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
* by this leaf and the one to it's left.
*
* There are two cases we can skip:
- * 1) Path is the leftmost one in our inode tree.
+ * 1) Path is the leftmost one in our btree.
* 2) The leaf is rightmost and will be empty after
* we remove the extent record - the rotate code
* knows how to update the newly formed edge.
*/
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
- &left_cpos);
+ ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5209,7 +5368,8 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
goto out;
}
- ret = ocfs2_find_path(inode, left_path, left_cpos);
+ ret = ocfs2_find_path(et->et_ci, left_path,
+ left_cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5225,13 +5385,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
goto out;
}
- ret = ocfs2_journal_access_path(inode, handle, path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access_path(inode, handle, left_path);
+ ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5254,7 +5414,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
* be deleted by the rotate code.
*/
rec = &el->l_recs[next_free - 1];
- ocfs2_adjust_rightmost_records(inode, handle, path,
+ ocfs2_adjust_rightmost_records(handle, et, path,
rec);
}
} else if (le32_to_cpu(rec->e_cpos) == cpos) {
@@ -5266,11 +5426,12 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
/* Remove rightmost portion of the record */
le16_add_cpu(&rec->e_leaf_clusters, -len);
if (is_rightmost_tree_rec)
- ocfs2_adjust_rightmost_records(inode, handle, path, rec);
+ ocfs2_adjust_rightmost_records(handle, et, path, rec);
} else {
/* Caller should have trapped this. */
- mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
- "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
+ "(%u, %u)\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
le32_to_cpu(rec->e_cpos),
le16_to_cpu(rec->e_leaf_clusters), cpos, len);
BUG();
@@ -5279,14 +5440,14 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
if (left_path) {
int subtree_index;
- subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
- ocfs2_complete_edge_insert(inode, handle, left_path, path,
+ subtree_index = ocfs2_find_subtree_root(et, left_path, path);
+ ocfs2_complete_edge_insert(handle, left_path, path,
subtree_index);
}
ocfs2_journal_dirty(handle, path_leaf_bh(path));
- ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
+ ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5297,9 +5458,9 @@ out:
return ret;
}
-int ocfs2_remove_extent(struct inode *inode,
+int ocfs2_remove_extent(handle_t *handle,
struct ocfs2_extent_tree *et,
- u32 cpos, u32 len, handle_t *handle,
+ u32 cpos, u32 len,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
@@ -5309,7 +5470,11 @@ int ocfs2_remove_extent(struct inode *inode,
struct ocfs2_extent_list *el;
struct ocfs2_path *path = NULL;
- ocfs2_extent_map_trunc(inode, 0);
+ /*
+ * XXX: Why are we truncating to 0 instead of wherever this
+ * affects us?
+ */
+ ocfs2_et_extent_map_truncate(et, 0);
path = ocfs2_new_path_from_et(et);
if (!path) {
@@ -5318,7 +5483,7 @@ int ocfs2_remove_extent(struct inode *inode,
goto out;
}
- ret = ocfs2_find_path(inode, path, cpos);
+ ret = ocfs2_find_path(et->et_ci, path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5327,10 +5492,11 @@ int ocfs2_remove_extent(struct inode *inode,
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has an extent at cpos %u which can no "
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu has an extent at cpos %u which can no "
"longer be found.\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
ret = -EROFS;
goto out;
}
@@ -5357,20 +5523,20 @@ int ocfs2_remove_extent(struct inode *inode,
BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
- mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
- "(cpos %u, len %u)\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
- le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
+ trace_ocfs2_remove_extent(
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos, len, index, le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
- ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
- cpos, len, et);
+ ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+ cpos, len);
if (ret) {
mlog_errno(ret);
goto out;
}
} else {
- ret = ocfs2_split_tree(inode, et, handle, path, index,
+ ret = ocfs2_split_tree(handle, et, path, index,
trunc_range, meta_ac);
if (ret) {
mlog_errno(ret);
@@ -5383,7 +5549,7 @@ int ocfs2_remove_extent(struct inode *inode,
*/
ocfs2_reinit_path(path, 1);
- ret = ocfs2_find_path(inode, path, cpos);
+ ret = ocfs2_find_path(et->et_ci, path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5392,9 +5558,9 @@ int ocfs2_remove_extent(struct inode *inode,
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
- ocfs2_error(inode->i_sb,
- "Inode %llu: split at cpos %u lost record.",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu: split at cpos %u lost record.",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
goto out;
@@ -5408,18 +5574,18 @@ int ocfs2_remove_extent(struct inode *inode,
rec_range = le32_to_cpu(rec->e_cpos) +
ocfs2_rec_clusters(el, rec);
if (rec_range != trunc_range) {
- ocfs2_error(inode->i_sb,
- "Inode %llu: error after split at cpos %u"
+ ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+ "Owner %llu: error after split at cpos %u"
"trunc len %u, existing record is (%u,%u)",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos, len, le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
goto out;
}
- ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
- cpos, len, et);
+ ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
+ cpos, len);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5431,22 +5597,100 @@ out:
return ret;
}
+/*
+ * ocfs2_reserve_blocks_for_rec_trunc() looks basically the same as
+ * ocfs2_lock_allocators(), except that it accepts a number of extra
+ * blocks to reserve, and it only handles metadata
+ * allocations.
+ *
+ * Currently, only ocfs2_remove_btree_range() uses it for truncating
+ * and punching holes.
+ */
+static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 extents_to_split,
+ struct ocfs2_alloc_context **ac,
+ int extra_blocks)
+{
+ int ret = 0, num_free_extents;
+ unsigned int max_recs_needed = 2 * extents_to_split;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ *ac = NULL;
+
+ num_free_extents = ocfs2_num_free_extents(osb, et);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!num_free_extents ||
+ (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
+ extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+ if (extra_blocks) {
+ ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ if (ret) {
+ if (*ac) {
+ ocfs2_free_alloc_context(*ac);
+ *ac = NULL;
+ }
+ }
+
+ return ret;
+}
+
int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_extent_tree *et,
- u32 cpos, u32 phys_cpos, u32 len,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+ u32 cpos, u32 phys_cpos, u32 len, int flags,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ u64 refcount_loc)
{
- int ret;
+ int ret, credits = 0, extra_blocks = 0;
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *tl_inode = osb->osb_tl_inode;
handle_t *handle;
struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+
+ if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+ OCFS2_HAS_REFCOUNT_FL));
- ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
+ ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+ &ref_tree, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ ret = ocfs2_prepare_refcount_change_for_del(inode,
+ refcount_loc,
+ phys_blkno,
+ len,
+ &credits,
+ &extra_blocks);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto bail;
+ }
+ }
+
+ ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
+ extra_blocks);
if (ret) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
mutex_lock(&tl_inode->i_mutex);
@@ -5459,50 +5703,61 @@ int ocfs2_remove_btree_range(struct inode *inode,
}
}
- handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+ handle = ocfs2_start_trans(osb,
+ ocfs2_remove_extent_credits(osb->sb) + credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out;
}
- ret = ocfs2_et_root_journal_access(handle, inode, et,
+ ret = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_commit;
}
- vfs_dq_free_space_nodirty(inode,
+ dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(inode->i_sb, len));
- ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
- dealloc);
+ ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
- ocfs2_et_update_clusters(inode, et, -len);
+ ocfs2_et_update_clusters(et, -len);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
- ret = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, et->et_root_bh);
- ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
- if (ret)
- mlog_errno(ret);
+ if (phys_blkno) {
+ if (flags & OCFS2_EXT_REFCOUNTED)
+ ret = ocfs2_decrease_refcount(inode, handle,
+ ocfs2_blocks_to_clusters(osb->sb,
+ phys_blkno),
+ len, meta_ac,
+ dealloc, 1);
+ else
+ ret = ocfs2_truncate_log_append(osb, handle,
+ phys_blkno, len);
+ if (ret)
+ mlog_errno(ret);
+
+ }
out_commit:
ocfs2_commit_trans(osb, handle);
out:
mutex_unlock(&tl_inode->i_mutex);
-
+bail:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
return ret;
}
@@ -5551,9 +5806,6 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
- mlog_entry("start_blk = %llu, num_clusters = %u\n",
- (unsigned long long)start_blk, num_clusters);
-
BUG_ON(mutex_trylock(&tl_inode->i_mutex));
start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
@@ -5583,17 +5835,16 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- mlog(0, "Log truncate of %u clusters starting at cluster %u to "
- "%llu (index = %d)\n", num_clusters, start_cluster,
- (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
-
+ trace_ocfs2_truncate_log_append(
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
+ start_cluster, num_clusters);
if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
/*
* Move index back to the record we are coalescing with.
@@ -5602,23 +5853,20 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
index--;
num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
- mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
- index, le32_to_cpu(tl->tl_recs[index].t_start),
- num_clusters);
+ trace_ocfs2_truncate_log_append(
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+ index, le32_to_cpu(tl->tl_recs[index].t_start),
+ num_clusters);
} else {
tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
tl->tl_used = cpu_to_le16(index + 1);
}
tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
- status = ocfs2_journal_dirty(handle, tl_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, tl_bh);
+ osb->truncated_clusters += num_clusters;
bail:
- mlog_exit(status);
return status;
}
@@ -5637,15 +5885,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
struct inode *tl_inode = osb->osb_tl_inode;
struct buffer_head *tl_bh = osb->osb_tl_bh;
- mlog_entry_void();
-
di = (struct ocfs2_dinode *) tl_bh->b_data;
tl = &di->id2.i_dealloc;
i = le16_to_cpu(tl->tl_used) - 1;
while (i >= 0) {
/* Caller has given us at least enough credits to
* update the truncate log dinode */
- status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -5654,11 +5900,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
tl->tl_used = cpu_to_le16(i);
- status = ocfs2_journal_dirty(handle, tl_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, tl_bh);
/* TODO: Perhaps we can calculate the bulk of the
* credits up front rather than extending like
@@ -5678,8 +5920,9 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
/* if start_blk is not set, we ignore the record as
* invalid. */
if (start_blk) {
- mlog(0, "free record %d, start = %u, clusters = %u\n",
- i, le32_to_cpu(rec.t_start), num_clusters);
+ trace_ocfs2_replay_truncate_records(
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+ i, le32_to_cpu(rec.t_start), num_clusters);
status = ocfs2_free_clusters(handle, data_alloc_inode,
data_alloc_bh, start_blk,
@@ -5692,8 +5935,9 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
i--;
}
+ osb->truncated_clusters = 0;
+
bail:
- mlog_exit(status);
return status;
}
@@ -5710,8 +5954,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
- mlog_entry_void();
-
BUG_ON(mutex_trylock(&tl_inode->i_mutex));
di = (struct ocfs2_dinode *) tl_bh->b_data;
@@ -5723,8 +5965,9 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
tl = &di->id2.i_dealloc;
num_to_flush = le16_to_cpu(tl->tl_used);
- mlog(0, "Flush %u records from truncate log #%llu\n",
- num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
+ trace_ocfs2_flush_truncate_log(
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+ num_to_flush);
if (!num_to_flush) {
status = 0;
goto out;
@@ -5770,7 +6013,6 @@ out_mutex:
iput(data_alloc_inode);
out:
- mlog_exit(status);
return status;
}
@@ -5793,22 +6035,19 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
container_of(work, struct ocfs2_super,
osb_truncate_log_wq.work);
- mlog_entry_void();
-
status = ocfs2_flush_truncate_log(osb);
if (status < 0)
mlog_errno(status);
else
- ocfs2_init_inode_steal_slot(osb);
-
- mlog_exit(status);
+ ocfs2_init_steal_slots(osb);
}
#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
int cancel)
{
- if (osb->osb_tl_inode) {
+ if (osb->osb_tl_inode &&
+ atomic_read(&osb->osb_tl_disable) == 0) {
/* We want to push off log flushes while truncates are
* still running. */
if (cancel)
@@ -5847,7 +6086,6 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
*tl_inode = inode;
*tl_bh = bh;
bail:
- mlog_exit(status);
return status;
}
@@ -5867,7 +6105,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
*tl_copy = NULL;
- mlog(0, "recover truncate log from slot %d\n", slot_num);
+ trace_ocfs2_begin_truncate_log_recovery(slot_num);
status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
if (status < 0) {
@@ -5884,8 +6122,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
tl = &di->id2.i_dealloc;
if (le16_to_cpu(tl->tl_used)) {
- mlog(0, "We'll have %u logs to recover\n",
- le16_to_cpu(tl->tl_used));
+ trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
if (!(*tl_copy)) {
@@ -5903,7 +6140,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
tl->tl_used = 0;
ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
- status = ocfs2_write_block(osb, tl_bh, tl_inode);
+ status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -5918,9 +6155,9 @@ bail:
if (status < 0 && (*tl_copy)) {
kfree(*tl_copy);
*tl_copy = NULL;
+ mlog_errno(status);
}
- mlog_exit(status);
return status;
}
@@ -5935,8 +6172,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
struct inode *tl_inode = osb->osb_tl_inode;
struct ocfs2_truncate_log *tl;
- mlog_entry_void();
-
if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
return -EINVAL;
@@ -5944,8 +6179,9 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
tl = &tl_copy->id2.i_dealloc;
num_recs = le16_to_cpu(tl->tl_used);
- mlog(0, "cleanup %u records from %llu\n", num_recs,
- (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
+ trace_ocfs2_complete_truncate_log_recovery(
+ (unsigned long long)le64_to_cpu(tl_copy->i_blkno),
+ num_recs);
mutex_lock(&tl_inode->i_mutex);
for(i = 0; i < num_recs; i++) {
@@ -5980,7 +6216,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
bail_up:
mutex_unlock(&tl_inode->i_mutex);
- mlog_exit(status);
return status;
}
@@ -5989,7 +6224,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
int status;
struct inode *tl_inode = osb->osb_tl_inode;
- mlog_entry_void();
+ atomic_set(&osb->osb_tl_disable, 1);
if (tl_inode) {
cancel_delayed_work(&osb->osb_truncate_log_wq);
@@ -6002,8 +6237,6 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
brelse(osb->osb_tl_bh);
iput(osb->osb_tl_inode);
}
-
- mlog_exit_void();
}
int ocfs2_truncate_log_init(struct ocfs2_super *osb)
@@ -6012,8 +6245,6 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
struct inode *tl_inode = NULL;
struct buffer_head *tl_bh = NULL;
- mlog_entry_void();
-
status = ocfs2_get_truncate_log_info(osb,
osb->slot_num,
&tl_inode,
@@ -6026,10 +6257,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
* until we're sure all is well. */
INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
ocfs2_truncate_log_worker);
+ atomic_set(&osb->osb_tl_disable, 0);
osb->osb_tl_bh = tl_bh;
osb->osb_tl_inode = tl_inode;
- mlog_exit(status);
return status;
}
@@ -6059,6 +6290,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
*/
struct ocfs2_cached_block_free {
struct ocfs2_cached_block_free *free_next;
+ u64 free_bg;
u64 free_blk;
unsigned int free_bit;
};
@@ -6105,10 +6337,13 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
}
while (head) {
- bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
- head->free_bit);
- mlog(0, "Free bit: (bit %u, blkno %llu)\n",
- head->free_bit, (unsigned long long)head->free_blk);
+ if (head->free_bg)
+ bg_blkno = head->free_bg;
+ else
+ bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
+ head->free_bit);
+ trace_ocfs2_free_cached_blocks(
+ (unsigned long long)head->free_blk, head->free_bit);
ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
head->free_bit, bg_blkno, 1);
@@ -6154,15 +6389,14 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
int ret = 0;
struct ocfs2_cached_block_free *item;
- item = kmalloc(sizeof(*item), GFP_NOFS);
+ item = kzalloc(sizeof(*item), GFP_NOFS);
if (item == NULL) {
ret = -ENOMEM;
mlog_errno(ret);
return ret;
}
- mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
- bit, (unsigned long long)blkno);
+ trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit);
item->free_blk = blkno;
item->free_bit = bit;
@@ -6237,8 +6471,8 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
fl = ctxt->c_first_suballocator;
if (fl->f_first) {
- mlog(0, "Free items: (type %u, slot %d)\n",
- fl->f_inode_type, fl->f_slot);
+ trace_ocfs2_run_deallocs(fl->f_inode_type,
+ fl->f_slot);
ret2 = ocfs2_free_cached_blocks(osb,
fl->f_inode_type,
fl->f_slot,
@@ -6293,9 +6527,9 @@ ocfs2_find_per_slot_free_list(int type,
return fl;
}
-static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
- int type, int slot, u64 blkno,
- unsigned int bit)
+int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ int type, int slot, u64 suballoc,
+ u64 blkno, unsigned int bit)
{
int ret;
struct ocfs2_per_slot_free_list *fl;
@@ -6308,16 +6542,18 @@ static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
goto out;
}
- item = kmalloc(sizeof(*item), GFP_NOFS);
+ item = kzalloc(sizeof(*item), GFP_NOFS);
if (item == NULL) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
- mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
- type, slot, bit, (unsigned long long)blkno);
+ trace_ocfs2_cache_block_dealloc(type, slot,
+ (unsigned long long)suballoc,
+ (unsigned long long)blkno, bit);
+ item->free_bg = suballoc;
item->free_blk = blkno;
item->free_bit = bit;
item->free_next = fl->f_first;
@@ -6334,421 +6570,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
{
return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
le16_to_cpu(eb->h_suballoc_slot),
+ le64_to_cpu(eb->h_suballoc_loc),
le64_to_cpu(eb->h_blkno),
le16_to_cpu(eb->h_suballoc_bit));
}
-/* This function will figure out whether the currently last extent
- * block will be deleted, and if it will, what the new last extent
- * block will be so we can update his h_next_leaf_blk field, as well
- * as the dinodes i_last_eb_blk */
-static int ocfs2_find_new_last_ext_blk(struct inode *inode,
- unsigned int clusters_to_del,
- struct ocfs2_path *path,
- struct buffer_head **new_last_eb)
-{
- int next_free, ret = 0;
- u32 cpos;
- struct ocfs2_extent_rec *rec;
- struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *el;
- struct buffer_head *bh = NULL;
-
- *new_last_eb = NULL;
-
- /* we have no tree, so of course, no last_eb. */
- if (!path->p_tree_depth)
- goto out;
-
- /* trunc to zero special case - this makes tree_depth = 0
- * regardless of what it is. */
- if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
- goto out;
-
- el = path_leaf_el(path);
- BUG_ON(!el->l_next_free_rec);
-
- /*
- * Make sure that this extent list will actually be empty
- * after we clear away the data. We can shortcut out if
- * there's more than one non-empty extent in the
- * list. Otherwise, a check of the remaining extent is
- * necessary.
- */
- next_free = le16_to_cpu(el->l_next_free_rec);
- rec = NULL;
- if (ocfs2_is_empty_extent(&el->l_recs[0])) {
- if (next_free > 2)
- goto out;
-
- /* We may have a valid extent in index 1, check it. */
- if (next_free == 2)
- rec = &el->l_recs[1];
-
- /*
- * Fall through - no more nonempty extents, so we want
- * to delete this leaf.
- */
- } else {
- if (next_free > 1)
- goto out;
-
- rec = &el->l_recs[0];
- }
-
- if (rec) {
- /*
- * Check it we'll only be trimming off the end of this
- * cluster.
- */
- if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
- goto out;
- }
-
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- eb = (struct ocfs2_extent_block *) bh->b_data;
- el = &eb->h_list;
-
- /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
- * Any corruption is a code bug. */
- BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
-
- *new_last_eb = bh;
- get_bh(*new_last_eb);
- mlog(0, "returning block %llu, (cpos: %u)\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
-out:
- brelse(bh);
-
- return ret;
-}
-
-/*
- * Trim some clusters off the rightmost edge of a tree. Only called
- * during truncate.
- *
- * The caller needs to:
- * - start journaling of each path component.
- * - compute and fully set up any new last ext block
- */
-static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
- handle_t *handle, struct ocfs2_truncate_context *tc,
- u32 clusters_to_del, u64 *delete_start)
-{
- int ret, i, index = path->p_tree_depth;
- u32 new_edge = 0;
- u64 deleted_eb = 0;
- struct buffer_head *bh;
- struct ocfs2_extent_list *el;
- struct ocfs2_extent_rec *rec;
-
- *delete_start = 0;
-
- while (index >= 0) {
- bh = path->p_node[index].bh;
- el = path->p_node[index].el;
-
- mlog(0, "traveling tree (index = %d, block = %llu)\n",
- index, (unsigned long long)bh->b_blocknr);
-
- BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
-
- if (index !=
- (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has invalid ext. block %llu",
- inode->i_ino,
- (unsigned long long)bh->b_blocknr);
- ret = -EROFS;
- goto out;
- }
-
-find_tail_record:
- i = le16_to_cpu(el->l_next_free_rec) - 1;
- rec = &el->l_recs[i];
-
- mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
- "next = %u\n", i, le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec),
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- le16_to_cpu(el->l_next_free_rec));
-
- BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
-
- if (le16_to_cpu(el->l_tree_depth) == 0) {
- /*
- * If the leaf block contains a single empty
- * extent and no records, we can just remove
- * the block.
- */
- if (i == 0 && ocfs2_is_empty_extent(rec)) {
- memset(rec, 0,
- sizeof(struct ocfs2_extent_rec));
- el->l_next_free_rec = cpu_to_le16(0);
-
- goto delete;
- }
-
- /*
- * Remove any empty extents by shifting things
- * left. That should make life much easier on
- * the code below. This condition is rare
- * enough that we shouldn't see a performance
- * hit.
- */
- if (ocfs2_is_empty_extent(&el->l_recs[0])) {
- le16_add_cpu(&el->l_next_free_rec, -1);
-
- for(i = 0;
- i < le16_to_cpu(el->l_next_free_rec); i++)
- el->l_recs[i] = el->l_recs[i + 1];
-
- memset(&el->l_recs[i], 0,
- sizeof(struct ocfs2_extent_rec));
-
- /*
- * We've modified our extent list. The
- * simplest way to handle this change
- * is to being the search from the
- * start again.
- */
- goto find_tail_record;
- }
-
- le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
-
- /*
- * We'll use "new_edge" on our way back up the
- * tree to know what our rightmost cpos is.
- */
- new_edge = le16_to_cpu(rec->e_leaf_clusters);
- new_edge += le32_to_cpu(rec->e_cpos);
-
- /*
- * The caller will use this to delete data blocks.
- */
- *delete_start = le64_to_cpu(rec->e_blkno)
- + ocfs2_clusters_to_blocks(inode->i_sb,
- le16_to_cpu(rec->e_leaf_clusters));
-
- /*
- * If it's now empty, remove this record.
- */
- if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
- memset(rec, 0,
- sizeof(struct ocfs2_extent_rec));
- le16_add_cpu(&el->l_next_free_rec, -1);
- }
- } else {
- if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
- memset(rec, 0,
- sizeof(struct ocfs2_extent_rec));
- le16_add_cpu(&el->l_next_free_rec, -1);
-
- goto delete;
- }
-
- /* Can this actually happen? */
- if (le16_to_cpu(el->l_next_free_rec) == 0)
- goto delete;
-
- /*
- * We never actually deleted any clusters
- * because our leaf was empty. There's no
- * reason to adjust the rightmost edge then.
- */
- if (new_edge == 0)
- goto delete;
-
- rec->e_int_clusters = cpu_to_le32(new_edge);
- le32_add_cpu(&rec->e_int_clusters,
- -le32_to_cpu(rec->e_cpos));
-
- /*
- * A deleted child record should have been
- * caught above.
- */
- BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
- }
-
-delete:
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- mlog(0, "extent list container %llu, after: record %d: "
- "(%u, %u, %llu), next = %u.\n",
- (unsigned long long)bh->b_blocknr, i,
- le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- le16_to_cpu(el->l_next_free_rec));
-
- /*
- * We must be careful to only attempt delete of an
- * extent block (and not the root inode block).
- */
- if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
- struct ocfs2_extent_block *eb =
- (struct ocfs2_extent_block *)bh->b_data;
-
- /*
- * Save this for use when processing the
- * parent block.
- */
- deleted_eb = le64_to_cpu(eb->h_blkno);
-
- mlog(0, "deleting this extent block.\n");
-
- ocfs2_remove_from_cache(inode, bh);
-
- BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
- BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
- BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
-
- ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
- /* An error here is not fatal. */
- if (ret < 0)
- mlog_errno(ret);
- } else {
- deleted_eb = 0;
- }
-
- index--;
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-static int ocfs2_do_truncate(struct ocfs2_super *osb,
- unsigned int clusters_to_del,
- struct inode *inode,
- struct buffer_head *fe_bh,
- handle_t *handle,
- struct ocfs2_truncate_context *tc,
- struct ocfs2_path *path)
-{
- int status;
- struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *last_eb = NULL;
- struct ocfs2_extent_list *el;
- struct buffer_head *last_eb_bh = NULL;
- u64 delete_blk = 0;
-
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
- status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
- path, &last_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- /*
- * Each component will be touched, so we might as well journal
- * here to avoid having to handle errors later.
- */
- status = ocfs2_journal_access_path(inode, handle, path);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- if (last_eb_bh) {
- status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- }
-
- el = &(fe->id2.i_list);
-
- /*
- * Lower levels depend on this never happening, but it's best
- * to check it up here before changing the tree.
- */
- if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has an empty extent record, depth %u\n",
- inode->i_ino, le16_to_cpu(el->l_tree_depth));
- status = -EROFS;
- goto bail;
- }
-
- vfs_dq_free_space_nodirty(inode,
- ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
- clusters_to_del;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- le32_add_cpu(&fe->i_clusters, -clusters_to_del);
- inode->i_blocks = ocfs2_inode_sector_count(inode);
-
- status = ocfs2_trim_tree(inode, path, handle, tc,
- clusters_to_del, &delete_blk);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
-
- if (le32_to_cpu(fe->i_clusters) == 0) {
- /* trunc to zero is a special case. */
- el->l_tree_depth = 0;
- fe->i_last_eb_blk = 0;
- } else if (last_eb)
- fe->i_last_eb_blk = last_eb->h_blkno;
-
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- if (last_eb) {
- /* If there will be a new last extent block, then by
- * definition, there cannot be any leaves to the right of
- * him. */
- last_eb->h_next_leaf_blk = 0;
- status = ocfs2_journal_dirty(handle, last_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
-
- if (delete_blk) {
- status = ocfs2_truncate_log_append(osb, handle, delete_blk,
- clusters_to_del);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
- status = 0;
-bail:
-
- mlog_exit(status);
- return status;
-}
-
static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
{
set_buffer_uptodate(bh);
@@ -6756,9 +6582,9 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
- unsigned int from, unsigned int to,
- struct page *page, int zero, u64 *phys)
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+ unsigned int from, unsigned int to,
+ struct page *page, int zero, u64 *phys)
{
int ret, partial = 0;
@@ -6826,25 +6652,21 @@ out:
ocfs2_unlock_and_free_pages(pages, numpages);
}
-static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
- struct page **pages, int *num)
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+ struct page **pages, int *num)
{
int numpages, ret = 0;
- struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
unsigned long index;
loff_t last_page_bytes;
BUG_ON(start > end);
- BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
- (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
-
numpages = 0;
last_page_bytes = PAGE_ALIGN(end);
index = start >> PAGE_CACHE_SHIFT;
do {
- pages[numpages] = grab_cache_page(mapping, index);
+ pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
if (!pages[numpages]) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -6867,6 +6689,17 @@ out:
return ret;
}
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
+ struct page **pages, int *num)
+{
+ struct super_block *sb = inode->i_sb;
+
+ BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
+ (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
+
+ return ocfs2_grab_pages(inode, start, end, pages, num);
+}
+
/*
* Zero the area past i_size but still within an allocated
* cluster. This avoids exposing nonzero data on subsequent file
@@ -6933,14 +6766,13 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
* wait on them - the truncate_inode_pages() call later will
* do that for us.
*/
- ret = do_sync_mapping_range(inode->i_mapping, range_start,
- range_end - 1, SYNC_FILE_RANGE_WRITE);
+ ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
+ range_end - 1);
if (ret)
mlog_errno(ret);
out:
- if (pages)
- kfree(pages);
+ kfree(pages);
return ret;
}
@@ -6994,6 +6826,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct buffer_head *di_bh)
{
int ret, i, has_data, num_pages = 0;
+ int need_free = 0;
+ u32 bit_off, num;
handle_t *handle;
u64 uninitialized_var(block);
struct ocfs2_inode_info *oi = OCFS2_I(inode);
@@ -7031,7 +6865,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
goto out_unlock;
}
- ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -7039,18 +6873,18 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
if (has_data) {
- u32 bit_off, num;
unsigned int page_end;
u64 phys;
- if (vfs_dq_alloc_space_nodirty(inode,
- ocfs2_clusters_to_bytes(osb->sb, 1))) {
- ret = -EDQUOT;
+ ret = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (ret)
goto out_commit;
- }
did_quota = 1;
- ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+ data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
+ ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
&num);
if (ret) {
mlog_errno(ret);
@@ -7074,6 +6908,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
if (ret) {
mlog_errno(ret);
+ need_free = 1;
goto out_commit;
}
@@ -7084,6 +6919,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
if (ret) {
mlog_errno(ret);
+ need_free = 1;
goto out_commit;
}
@@ -7101,6 +6937,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
spin_unlock(&oi->ip_lock);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_dinode_new_extent_list(inode, di);
ocfs2_journal_dirty(handle, di_bh);
@@ -7111,11 +6948,11 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
* this proves to be false, we could always re-build
* the in-inode data from our pages.
*/
- ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
- ret = ocfs2_insert_extent(osb, handle, inode, &et,
- 0, block, 1, 0, NULL);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+ ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
if (ret) {
mlog_errno(ret);
+ need_free = 1;
goto out_commit;
}
@@ -7124,9 +6961,21 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
out_commit:
if (ret < 0 && did_quota)
- vfs_dq_free_space_nodirty(inode,
+ dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (need_free) {
+ if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ bit_off, num);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, bit_off),
+ num);
+ }
+
ocfs2_commit_trans(osb, handle);
out_unlock:
@@ -7150,23 +6999,27 @@ out:
*/
int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context *tc)
+ struct buffer_head *di_bh)
{
- int status, i, credits, tl_sem = 0;
- u32 clusters_to_del, new_highest_cpos, range;
+ int status = 0, i, flags = 0;
+ u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
+ u64 blkno = 0;
struct ocfs2_extent_list *el;
- handle_t *handle = NULL;
- struct inode *tl_inode = osb->osb_tl_inode;
+ struct ocfs2_extent_rec *rec;
struct ocfs2_path *path = NULL;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_extent_list *root_el = &(di->id2.i_list);
+ u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
+ struct ocfs2_extent_tree et;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
- mlog_entry_void();
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+ ocfs2_init_dealloc_ctxt(&dealloc);
new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
i_size_read(inode));
- path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+ path = ocfs2_new_path(di_bh, &di->id2.i_list,
ocfs2_journal_access_di);
if (!path) {
status = -ENOMEM;
@@ -7188,14 +7041,17 @@ start:
/*
* Truncate always works against the rightmost tree branch.
*/
- status = ocfs2_find_path(inode, path, UINT_MAX);
+ status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
if (status) {
mlog_errno(status);
goto bail;
}
- mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
- OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
+ trace_ocfs2_commit_truncate(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ new_highest_cpos,
+ OCFS2_I(inode)->ip_clusters,
+ path->p_tree_depth);
/*
* By now, el will point to the extent list on the bottom most
@@ -7219,61 +7075,60 @@ start:
}
i = le16_to_cpu(el->l_next_free_rec) - 1;
- range = le32_to_cpu(el->l_recs[i].e_cpos) +
- ocfs2_rec_clusters(el, &el->l_recs[i]);
- if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
- clusters_to_del = 0;
- } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
- clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
+ rec = &el->l_recs[i];
+ flags = rec->e_flags;
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+ if (i == 0 && ocfs2_is_empty_extent(rec)) {
+ /*
+ * Lower levels depend on this never happening, but it's best
+ * to check it up here before changing the tree.
+ */
+ if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
+ ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+ "extent record, depth %u\n", inode->i_ino,
+ le16_to_cpu(root_el->l_tree_depth));
+ status = -EROFS;
+ goto bail;
+ }
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = 0;
+ blkno = 0;
+ } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
+ /*
+ * Truncate entire record.
+ */
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = ocfs2_rec_clusters(el, rec);
+ blkno = le64_to_cpu(rec->e_blkno);
} else if (range > new_highest_cpos) {
- clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
- le32_to_cpu(el->l_recs[i].e_cpos)) -
- new_highest_cpos;
+ /*
+ * Partial truncate. It should also be
+ * the last truncate we're doing.
+ */
+ trunc_cpos = new_highest_cpos;
+ trunc_len = range - new_highest_cpos;
+ coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
+ blkno = le64_to_cpu(rec->e_blkno) +
+ ocfs2_clusters_to_blocks(inode->i_sb, coff);
} else {
+ /*
+ * Truncate completed, leave happily.
+ */
status = 0;
goto bail;
}
- mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
- clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
-
- mutex_lock(&tl_inode->i_mutex);
- tl_sem = 1;
- /* ocfs2_truncate_log_needs_flush guarantees us at least one
- * record is free for use. If there isn't any, we flush to get
- * an empty truncate log. */
- if (ocfs2_truncate_log_needs_flush(osb)) {
- status = __ocfs2_flush_truncate_log(osb);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
-
- credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
- (struct ocfs2_dinode *)fe_bh->b_data,
- el);
- handle = ocfs2_start_trans(osb, credits);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- handle = NULL;
- mlog_errno(status);
- goto bail;
- }
+ phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
- status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
- tc, path);
+ status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+ phys_cpos, trunc_len, flags, &dealloc,
+ refcount_loc);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- mutex_unlock(&tl_inode->i_mutex);
- tl_sem = 0;
-
- ocfs2_commit_trans(osb, handle);
- handle = NULL;
-
ocfs2_reinit_path(path, 1);
/*
@@ -7286,78 +7141,10 @@ bail:
ocfs2_schedule_truncate_log_flush(osb, 1);
- if (tl_sem)
- mutex_unlock(&tl_inode->i_mutex);
-
- if (handle)
- ocfs2_commit_trans(osb, handle);
-
- ocfs2_run_deallocs(osb, &tc->tc_dealloc);
+ ocfs2_run_deallocs(osb, &dealloc);
ocfs2_free_path(path);
- /* This will drop the ext_alloc cluster lock for us */
- ocfs2_free_truncate_context(tc);
-
- mlog_exit(status);
- return status;
-}
-
-/*
- * Expects the inode to already be locked.
- */
-int ocfs2_prepare_truncate(struct ocfs2_super *osb,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context **tc)
-{
- int status;
- unsigned int new_i_clusters;
- struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *eb;
- struct buffer_head *last_eb_bh = NULL;
-
- mlog_entry_void();
-
- *tc = NULL;
-
- new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
- i_size_read(inode));
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
- mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
- "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
- (unsigned long long)le64_to_cpu(fe->i_size));
-
- *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
- if (!(*tc)) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
- ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
-
- if (fe->id2.i_list.l_tree_depth) {
- status = ocfs2_read_extent_block(inode,
- le64_to_cpu(fe->i_last_eb_blk),
- &last_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- }
-
- (*tc)->tc_last_eb_bh = last_eb_bh;
-
- status = 0;
-bail:
- if (status < 0) {
- if (*tc)
- ocfs2_free_truncate_context(*tc);
- *tc = NULL;
- }
- mlog_exit_void();
return status;
}
@@ -7377,7 +7164,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
if (end > i_size_read(inode))
end = i_size_read(inode);
- BUG_ON(start >= end);
+ BUG_ON(start > end);
if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
@@ -7400,7 +7187,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
goto out;
}
- ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -7427,6 +7214,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, di_bh);
out_commit:
@@ -7436,17 +7224,163 @@ out:
return ret;
}
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
+static int ocfs2_trim_extent(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ u32 start, u32 count)
{
- /*
- * The caller is responsible for completing deallocation
- * before freeing the context.
- */
- if (tc->tc_dealloc.c_first_suballocator != NULL)
- mlog(ML_NOTICE,
- "Truncate completion has non-empty dealloc context\n");
+ u64 discard, bcount;
+
+ bcount = ocfs2_clusters_to_blocks(sb, count);
+ discard = le64_to_cpu(gd->bg_blkno) +
+ ocfs2_clusters_to_blocks(sb, start);
+
+ trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
- brelse(tc->tc_last_eb_bh);
+ return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
+}
+
+static int ocfs2_trim_group(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ u32 start, u32 max, u32 minbits)
+{
+ int ret = 0, count = 0, next;
+ void *bitmap = gd->bg_bitmap;
- kfree(tc);
+ if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
+ return 0;
+
+ trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
+ start, max, minbits);
+
+ while (start < max) {
+ start = ocfs2_find_next_zero_bit(bitmap, max, start);
+ if (start >= max)
+ break;
+ next = ocfs2_find_next_bit(bitmap, max, start);
+
+ if ((next - start) >= minbits) {
+ ret = ocfs2_trim_extent(sb, gd,
+ start, next - start);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ count += next - start;
+ }
+ start = next + 1;
+
+ if (fatal_signal_pending(current)) {
+ count = -ERESTARTSYS;
+ break;
+ }
+
+ if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
+ break;
+ }
+
+ if (ret < 0)
+ count = ret;
+
+ return count;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ u64 start, len, trimmed, first_group, last_group, group;
+ int ret, cnt;
+ u32 first_bit, last_bit, minlen;
+ struct buffer_head *main_bm_bh = NULL;
+ struct inode *main_bm_inode = NULL;
+ struct buffer_head *gd_bh = NULL;
+ struct ocfs2_dinode *main_bm;
+ struct ocfs2_group_desc *gd = NULL;
+
+ start = range->start >> osb->s_clustersize_bits;
+ len = range->len >> osb->s_clustersize_bits;
+ minlen = range->minlen >> osb->s_clustersize_bits;
+
+ if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
+ return -EINVAL;
+
+ main_bm_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!main_bm_inode) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock(&main_bm_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+ main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ len = range->len >> osb->s_clustersize_bits;
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
+
+ trace_ocfs2_trim_fs(start, len, minlen);
+
+ /* Determine first and last group to examine based on start and len */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
+ last_bit = osb->bitmap_cpg;
+
+ trimmed = 0;
+ for (group = first_group; group <= last_group;) {
+ if (first_bit + len >= osb->bitmap_cpg)
+ last_bit = osb->bitmap_cpg;
+ else
+ last_bit = first_bit + len;
+
+ ret = ocfs2_read_group_descriptor(main_bm_inode,
+ main_bm, group,
+ &gd_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
+ gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+ cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+ brelse(gd_bh);
+ gd_bh = NULL;
+ if (cnt < 0) {
+ ret = cnt;
+ mlog_errno(ret);
+ break;
+ }
+
+ trimmed += cnt;
+ len -= osb->bitmap_cpg - first_bit;
+ first_bit = 0;
+ if (group == osb->first_cluster_group_blkno)
+ group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ else
+ group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ }
+ range->len = trimmed * sb->s_blocksize;
+out_unlock:
+ ocfs2_inode_unlock(main_bm_inode, 0);
+ brelse(main_bm_bh);
+out_mutex:
+ mutex_unlock(&main_bm_inode->i_mutex);
+ iput(main_bm_inode);
+out:
+ return ret;
}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 353254ba29e..ca381c58412 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,8 @@
*
* ocfs2_extent_tree contains info for the root of the b-tree, it must have a
* root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
- * functions. With metadata ecc, we now call different journal_access
+ * functions. It needs the ocfs2_caching_info structure associated with
+ * I/O on the tree. With metadata ecc, we now call different journal_access
* functions for each type of metadata, so it must have the
* root_journal_access function.
* ocfs2_extent_tree_operations abstract the normal operations we do for
@@ -56,6 +57,7 @@ struct ocfs2_extent_tree {
struct ocfs2_extent_tree_operations *et_ops;
struct buffer_head *et_root_bh;
struct ocfs2_extent_list *et_root_el;
+ struct ocfs2_caching_info *et_ci;
ocfs2_journal_access_func et_root_journal_access;
void *et_object;
unsigned int et_max_leaf_clusters;
@@ -66,31 +68,32 @@ struct ocfs2_extent_tree {
* specified object buffer.
*/
void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh);
void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh);
struct ocfs2_xattr_value_buf;
void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct ocfs2_xattr_value_buf *vb);
void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh);
+void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *bh);
/*
* Read an extent block into *bh. If *bh is NULL, a bh will be
* allocated. This is a cached read. The extent block will be validated
* with ocfs2_validate_extent_block().
*/
-int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
struct buffer_head **bh);
struct ocfs2_alloc_context;
-int ocfs2_insert_extent(struct ocfs2_super *osb,
- handle_t *handle,
- struct inode *inode,
+int ocfs2_insert_extent(handle_t *handle,
struct ocfs2_extent_tree *et,
u32 cpos,
u64 start_blk,
@@ -103,34 +106,45 @@ enum ocfs2_alloc_restarted {
RESTART_TRANS,
RESTART_META
};
-int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
- struct inode *inode,
+int ocfs2_add_clusters_in_btree(handle_t *handle,
+ struct ocfs2_extent_tree *et,
u32 *logical_offset,
u32 clusters_to_add,
int mark_unwritten,
- struct ocfs2_extent_tree *et,
- handle_t *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret);
struct ocfs2_cached_dealloc_ctxt;
+struct ocfs2_path;
+int ocfs2_split_extent(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ int split_index,
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_mark_extent_written(struct inode *inode,
struct ocfs2_extent_tree *et,
handle_t *handle, u32 cpos, u32 len, u32 phys,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
-int ocfs2_remove_extent(struct inode *inode,
- struct ocfs2_extent_tree *et,
- u32 cpos, u32 len, handle_t *handle,
+int ocfs2_change_extent_flag(handle_t *handle,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int new_flags, int clear_flags);
+int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
+ u32 cpos, u32 len,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_extent_tree *et,
- u32 cpos, u32 phys_cpos, u32 len,
- struct ocfs2_cached_dealloc_ctxt *dealloc);
+ u32 cpos, u32 phys_cpos, u32 len, int flags,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ u64 refcount_loc);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
- struct inode *inode,
struct ocfs2_extent_tree *et);
/*
@@ -195,6 +209,9 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
}
int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
u64 blkno, unsigned int bit);
+int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ int type, int slot, u64 suballoc, u64 blkno,
+ unsigned int bit);
static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
{
return c->c_global_allocator != NULL;
@@ -211,21 +228,18 @@ struct ocfs2_truncate_context {
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
u64 range_start, u64 range_end);
-int ocfs2_prepare_truncate(struct ocfs2_super *osb,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context **tc);
int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context *tc);
+ struct buffer_head *di_bh);
int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
unsigned int start, unsigned int end, int trunc);
-int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
- u32 cpos, struct buffer_head **leaf_bh);
+int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
+ struct ocfs2_extent_list *root_el, u32 cpos,
+ struct buffer_head **leaf_bh);
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
/*
* Helper function to look at the # of clusters in an extent record.
*/
@@ -254,4 +268,57 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
return !rec->e_leaf_clusters;
}
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+ struct page **pages, int *num);
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+ unsigned int from, unsigned int to,
+ struct page *page, int zero, u64 *phys);
+/*
+ * Structures which describe a path through a btree, and functions to
+ * manipulate them.
+ *
+ * The idea here is to be as generic as possible with the tree
+ * manipulation code.
+ */
+struct ocfs2_path_item {
+ struct buffer_head *bh;
+ struct ocfs2_extent_list *el;
+};
+
+#define OCFS2_MAX_PATH_DEPTH 5
+
+struct ocfs2_path {
+ int p_tree_depth;
+ ocfs2_journal_access_func p_root_access;
+ struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
+};
+
+#define path_root_bh(_path) ((_path)->p_node[0].bh)
+#define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_root_access(_path)((_path)->p_root_access)
+#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
+#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
+#define path_num_items(_path) ((_path)->p_tree_depth + 1)
+
+void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root);
+void ocfs2_free_path(struct ocfs2_path *path);
+int ocfs2_find_path(struct ocfs2_caching_info *ci,
+ struct ocfs2_path *path,
+ u32 cpos);
+struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path);
+struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et);
+int ocfs2_path_bh_journal_access(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct ocfs2_path *path,
+ int idx);
+int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
+ handle_t *handle,
+ struct ocfs2_path *path);
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+ struct ocfs2_path *left,
+ struct ocfs2_path *right);
#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b2c52b3a148..4a231a166cf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -29,7 +29,6 @@
#include <linux/mpage.h>
#include <linux/quotaops.h>
-#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -44,6 +43,8 @@
#include "suballoc.h"
#include "super.h"
#include "symlink.h"
+#include "refcounttree.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -58,8 +59,9 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
void *kaddr;
- mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
- (unsigned long long)iblock, bh_result, create);
+ trace_ocfs2_symlink_get_block(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)iblock, bh_result, create);
BUG_ON(ocfs2_inode_is_fast_symlink(inode));
@@ -78,6 +80,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(fe->i_clusters))) {
+ err = -ENOMEM;
mlog(ML_ERROR, "block offset is outside the allocated size: "
"%llu\n", (unsigned long long)iblock);
goto bail;
@@ -90,6 +93,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
iblock;
buffer_cache_bh = sb_getblk(osb->sb, blkno);
if (!buffer_cache_bh) {
+ err = -ENOMEM;
mlog(ML_ERROR, "couldn't getblock for symlink!\n");
goto bail;
}
@@ -100,7 +104,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
* copy, the data is still good. */
if (buffer_jbd(buffer_cache_bh)
&& ocfs2_inode_is_new(inode)) {
- kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh_result->b_page);
if (!kaddr) {
mlog(ML_ERROR, "couldn't kmap!\n");
goto bail;
@@ -108,7 +112,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
memcpy(kaddr + (bh_result->b_size * iblock),
buffer_cache_bh->b_data,
bh_result->b_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
set_buffer_uptodate(bh_result);
}
brelse(buffer_cache_bh);
@@ -122,12 +126,11 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
bail:
brelse(bh);
- mlog_exit(err);
return err;
}
-static int ocfs2_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
{
int err = 0;
unsigned int ext_flags;
@@ -135,8 +138,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
u64 p_blkno, count, past_eof;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
- (unsigned long long)iblock, bh_result, create);
+ trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)iblock, bh_result, create);
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
@@ -164,7 +167,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
* ocfs2 never allocates in this function - the only time we
* need to use BH_New is when we're extending i_size on a file
* system which doesn't support holes, in which case BH_New
- * allows block_prepare_write() to zero.
+ * allows __block_write_begin() to zero.
*
* If we see this on a sparse file system, then a truncate has
* raced us and removed the cluster. In this case, we clear
@@ -193,21 +196,21 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
dump_stack();
+ goto bail;
}
+ }
- past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
- (unsigned long long)past_eof);
+ past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- if (create && (iblock >= past_eof))
- set_buffer_new(bh_result);
- }
+ trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)past_eof);
+ if (create && (iblock >= past_eof))
+ set_buffer_new(bh_result);
bail:
if (err < 0)
err = -EIO;
- mlog_exit(err);
return err;
}
@@ -235,13 +238,13 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
return -EROFS;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (size)
memcpy(kaddr, di->id2.i_data.id_data, size);
/* Clear the remaining part of the page */
memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
SetPageUptodate(page);
@@ -277,7 +280,8 @@ static int ocfs2_readpage(struct file *file, struct page *page)
loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
int ret, unlock = 1;
- mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
+ trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
+ (page ? page->index : 0));
ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
if (ret != 0) {
@@ -288,7 +292,15 @@ static int ocfs2_readpage(struct file *file, struct page *page)
}
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+ /*
+ * Unlock the page and cycle ip_alloc_sem so that we don't
+ * busyloop waiting for ip_alloc_sem to unlock
+ */
ret = AOP_TRUNCATED_PAGE;
+ unlock_page(page);
+ unlock = 0;
+ down_read(&oi->ip_alloc_sem);
+ up_read(&oi->ip_alloc_sem);
goto out_inode_unlock;
}
@@ -322,7 +334,6 @@ out_inode_unlock:
out:
if (unlock)
unlock_page(page);
- mlog_exit(ret);
return ret;
}
@@ -395,30 +406,11 @@ out_unlock:
*/
static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
{
- int ret;
-
- mlog_entry("(0x%p)\n", page);
-
- ret = block_write_full_page(page, ocfs2_get_block, wbc);
-
- mlog_exit(ret);
-
- return ret;
-}
-
-/*
- * This is called from ocfs2_write_zero_page() which has handled it's
- * own cluster locking and has ensured allocation exists for those
- * blocks to be written.
- */
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
- unsigned from, unsigned to)
-{
- int ret;
-
- ret = block_prepare_write(page, from, to, ocfs2_get_block);
+ trace_ocfs2_writepage(
+ (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
+ page->index);
- return ret;
+ return block_write_full_page(page, ocfs2_get_block, wbc);
}
/* Taken from ext3. We don't necessarily need the full blown
@@ -457,36 +449,6 @@ int walk_page_buffers( handle_t *handle,
return ret;
}
-handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
- struct page *page,
- unsigned from,
- unsigned to)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- handle_t *handle;
- int ret = 0;
-
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
- }
-
- if (ocfs2_should_order_data(inode)) {
- ret = ocfs2_jbd2_file_inode(handle, inode);
- if (ret < 0)
- mlog_errno(ret);
- }
-out:
- if (ret) {
- if (!IS_ERR(handle))
- ocfs2_commit_trans(osb, handle);
- handle = ERR_PTR(ret);
- }
- return handle;
-}
-
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
sector_t status;
@@ -494,7 +456,8 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
int err = 0;
struct inode *inode = mapping->host;
- mlog_entry("(block = %llu)\n", (unsigned long long)block);
+ trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)block);
/* We don't need to lock journal system files, since they aren't
* accessed concurrently from multiple nodes.
@@ -528,8 +491,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
bail:
status = err ? 0 : p_blkno;
- mlog_exit((int)status);
-
return status;
}
@@ -545,6 +506,9 @@ bail:
*
* called like this: dio->get_blocks(dio->inode, fs_startblk,
* fs_count, map_bh, dio->rw == WRITE);
+ *
+ * Note that we never bother to allocate blocks here, and thus ignore the
+ * create argument.
*/
static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -561,14 +525,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- /*
- * Any write past EOF is not allowed because we'd be extending.
- */
- if (create && (iblock + max_blocks) > inode_blocks) {
- ret = -EIO;
- goto bail;
- }
-
/* This figures out the size of the next contiguous block, and
* our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -580,14 +536,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
goto bail;
}
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has a hole at block %llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)iblock);
- ret = -EROFS;
- goto bail;
- }
+ /* We should already CoW the refcounted extent in case of create. */
+ BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
/*
* get_more_blocks() expects us to describe a hole by clearing
@@ -597,20 +547,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
*/
if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
map_bh(bh_result, inode->i_sb, p_blkno);
- else {
- /*
- * ocfs2_prepare_inode_for_write() should have caught
- * the case where we'd be filling a hole and triggered
- * a buffered write instead.
- */
- if (create) {
- ret = -EIO;
- mlog_errno(ret);
- goto bail;
- }
-
+ else
clear_buffer_mapped(bh_result);
- }
/* make sure we don't map more than max_blocks blocks here as
that's all the kernel will handle at this point. */
@@ -621,63 +559,51 @@ bail:
return ret;
}
-/*
+/*
* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
- * particularly interested in the aio/dio case. Like the core uses
- * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
- * truncation on another.
+ * particularly interested in the aio/dio case. We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
*/
static void ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset,
ssize_t bytes,
void *private)
{
- struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(iocb->ki_filp);
int level;
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+ if (ocfs2_iocb_is_sem_locked(iocb))
+ ocfs2_iocb_clear_sem_locked(iocb);
+
+ if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
+
+ mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
+ }
+
ocfs2_iocb_clear_rw_locked(iocb);
level = ocfs2_iocb_rw_locked_level(iocb);
- if (!level)
- up_read(&inode->i_alloc_sem);
ocfs2_rw_unlock(inode, level);
}
-/*
- * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
- * from ext3. PageChecked() bits have been removed as OCFS2 does not
- * do journalled data.
- */
-static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
-{
- journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
-
- jbd2_journal_invalidatepage(journal, page, offset);
-}
-
static int ocfs2_releasepage(struct page *page, gfp_t wait)
{
- journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
-
if (!page_has_buffers(page))
return 0;
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ return try_to_free_buffers(page);
}
static ssize_t ocfs2_direct_IO(int rw,
struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
- int ret;
-
- mlog_entry_void();
+ struct inode *inode = file_inode(file)->i_mapping->host;
/*
* Fallback to buffered I/O if we see an inode without
@@ -686,14 +612,14 @@ static ssize_t ocfs2_direct_IO(int rw,
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
return 0;
- ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
- inode->i_sb->s_bdev, iov, offset,
- nr_segs,
- ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io);
+ /* Fallback to buffered I/O if we are appending. */
+ if (i_size_read(inode) <= offset)
+ return 0;
- mlog_exit(ret);
- return ret;
+ return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+ iter, offset,
+ ocfs2_direct_IO_get_blocks,
+ ocfs2_dio_end_io, NULL, 0);
}
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
@@ -740,7 +666,7 @@ static void ocfs2_clear_page_regions(struct page *page,
ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (from || to) {
if (from > cluster_start)
@@ -751,7 +677,7 @@ static void ocfs2_clear_page_regions(struct page *page,
memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
/*
@@ -776,7 +702,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
}
/*
- * Some of this taken from block_prepare_write(). We already have our
+ * Some of this taken from __block_write_begin(). We already have our
* mapping by now though, and the entire write will be allocating or
* it won't, so not much need to use BH_New.
*
@@ -894,18 +820,17 @@ struct ocfs2_write_cluster_desc {
*/
unsigned c_new;
unsigned c_unwritten;
+ unsigned c_needs_zero;
};
-static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
-{
- return d->c_new || d->c_unwritten;
-}
-
struct ocfs2_write_ctxt {
/* Logical cluster position / len of write */
u32 w_cpos;
u32 w_clen;
+ /* First cluster allocated in a nonsparse extend */
+ u32 w_first_new_cpos;
+
struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
/*
@@ -928,11 +853,17 @@ struct ocfs2_write_ctxt {
* out in so that future reads from that region will get
* zero's.
*/
- struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
unsigned int w_num_pages;
+ struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
struct page *w_target_page;
/*
+ * w_target_locked is set on the page_mkwrite path to indicate that
+ * w_target_page must not be unlocked in ocfs2_write_end_nolock.
+ */
+ unsigned int w_target_locked:1;
+
+ /*
* ocfs2_write_end() uses this to know what the real range to
* write in the target should be.
*/
@@ -965,6 +896,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
{
+ int i;
+
+ /*
+ * w_target_locked is only set to true in the page_mkwrite() case.
+ * The intent is to allow us to lock the target page from write_begin()
+ * to write_end(). The caller must hold a ref on w_target_page.
+ */
+ if (wc->w_target_locked) {
+ BUG_ON(!wc->w_target_page);
+ for (i = 0; i < wc->w_num_pages; i++) {
+ if (wc->w_target_page == wc->w_pages[i]) {
+ wc->w_pages[i] = NULL;
+ break;
+ }
+ }
+ mark_page_accessed(wc->w_target_page);
+ page_cache_release(wc->w_target_page);
+ }
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
brelse(wc->w_di_bh);
@@ -983,6 +932,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
return -ENOMEM;
wc->w_cpos = pos >> osb->s_clustersize_bits;
+ wc->w_first_new_cpos = UINT_MAX;
cend = (pos + len - 1) >> osb->s_clustersize_bits;
wc->w_clen = cend - wc->w_cpos + 1;
get_bh(di_bh);
@@ -1082,6 +1032,12 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
&cluster_start, &cluster_end);
+ /* treat the write as new if a hole/lseek spanned across
+ * the page boundary.
+ */
+ new = new | ((i_size_read(inode) <= page_offset(page)) &&
+ (page_offset(page) <= user_pos));
+
if (page == wc->w_target_page) {
map_from = user_pos & (PAGE_CACHE_SIZE - 1);
map_to = map_from + user_len;
@@ -1148,23 +1104,37 @@ out:
*/
static int ocfs2_grab_pages_for_write(struct address_space *mapping,
struct ocfs2_write_ctxt *wc,
- u32 cpos, loff_t user_pos, int new,
+ u32 cpos, loff_t user_pos,
+ unsigned user_len, int new,
struct page *mmap_page)
{
int ret = 0, i;
- unsigned long start, target_index, index;
+ unsigned long start, target_index, end_index, index;
struct inode *inode = mapping->host;
+ loff_t last_byte;
target_index = user_pos >> PAGE_CACHE_SHIFT;
/*
* Figure out how many pages we'll be manipulating here. For
* non allocating write, we just change the one
- * page. Otherwise, we'll need a whole clusters worth.
+ * page. Otherwise, we'll need a whole cluster's worth. If we're
+ * writing past i_size, we only need enough pages to cover the
+ * last page of the write.
*/
if (new) {
wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+ /*
+ * We need the index *past* the last page we could possibly
+ * touch. This is the page past the end of the write or
+ * i_size, whichever is greater.
+ */
+ last_byte = max(user_pos + user_len, i_size_read(inode));
+ BUG_ON(last_byte < 1);
+ end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
+ if ((start + wc->w_num_pages) > end_index)
+ wc->w_num_pages = end_index - start;
} else {
wc->w_num_pages = 1;
start = target_index;
@@ -1181,20 +1151,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
*/
lock_page(mmap_page);
+ /* Exit and let the caller retry */
if (mmap_page->mapping != mapping) {
+ WARN_ON(mmap_page->mapping);
unlock_page(mmap_page);
- /*
- * Sanity check - the locking in
- * ocfs2_pagemkwrite() should ensure
- * that this code doesn't trigger.
- */
- ret = -EINVAL;
- mlog_errno(ret);
+ ret = -EAGAIN;
goto out;
}
page_cache_get(mmap_page);
wc->w_pages[i] = mmap_page;
+ wc->w_target_locked = true;
} else {
wc->w_pages[i] = find_or_create_page(mapping, index,
GFP_NOFS);
@@ -1204,11 +1171,14 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
goto out;
}
}
+ wait_for_stable_page(wc->w_pages[i]);
if (index == target_index)
wc->w_target_page = wc->w_pages[i];
}
out:
+ if (ret)
+ wc->w_target_locked = false;
return ret;
}
@@ -1217,20 +1187,18 @@ out:
*/
static int ocfs2_write_cluster(struct address_space *mapping,
u32 phys, unsigned int unwritten,
+ unsigned int should_zero,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_write_ctxt *wc, u32 cpos,
loff_t user_pos, unsigned user_len)
{
- int ret, i, new, should_zero = 0;
+ int ret, i, new;
u64 v_blkno, p_blkno;
struct inode *inode = mapping->host;
struct ocfs2_extent_tree et;
new = phys == 0 ? 1 : 0;
- if (new || unwritten)
- should_zero = 1;
-
if (new) {
u32 tmp_pos;
@@ -1260,7 +1228,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
goto out;
}
} else if (unwritten) {
- ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
+ wc->w_di_bh);
ret = ocfs2_mark_extent_written(inode, &et,
wc->w_handle, cpos, 1, phys,
meta_ac, &wc->w_dealloc);
@@ -1301,7 +1270,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
if (tmpret) {
mlog_errno(tmpret);
if (ret == 0)
- tmpret = ret;
+ ret = tmpret;
}
}
@@ -1341,7 +1310,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
local_len = osb->s_clustersize - cluster_off;
ret = ocfs2_write_cluster(mapping, desc->c_phys,
- desc->c_unwritten, data_ac, meta_ac,
+ desc->c_unwritten,
+ desc->c_needs_zero,
+ data_ac, meta_ac,
wc, desc->c_cpos, pos, local_len);
if (ret) {
mlog_errno(ret);
@@ -1391,14 +1362,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
* newly allocated cluster.
*/
desc = &wc->w_desc[0];
- if (ocfs2_should_zero_cluster(desc))
+ if (desc->c_needs_zero)
ocfs2_figure_cluster_boundaries(osb,
desc->c_cpos,
&wc->w_target_from,
NULL);
desc = &wc->w_desc[wc->w_clen - 1];
- if (ocfs2_should_zero_cluster(desc))
+ if (desc->c_needs_zero)
ocfs2_figure_cluster_boundaries(osb,
desc->c_cpos,
NULL,
@@ -1447,6 +1418,9 @@ static int ocfs2_populate_write_desc(struct inode *inode,
goto out;
}
+ /* We should already CoW the refcounted extent. */
+ BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+
/*
* Assume worst case - that we're writing in
* the middle of the extent.
@@ -1466,13 +1440,28 @@ static int ocfs2_populate_write_desc(struct inode *inode,
phys++;
}
+ /*
+ * If w_first_new_cpos is < UINT_MAX, we have a non-sparse
+ * file that got extended. w_first_new_cpos tells us
+ * where the newly allocated clusters are so we can
+ * zero them.
+ */
+ if (desc->c_cpos >= wc->w_first_new_cpos) {
+ BUG_ON(phys == 0);
+ desc->c_needs_zero = 1;
+ }
+
desc->c_phys = phys;
if (phys == 0) {
desc->c_new = 1;
+ desc->c_needs_zero = 1;
*clusters_to_alloc = *clusters_to_alloc + 1;
}
- if (ext_flags & OCFS2_EXT_UNWRITTEN)
+
+ if (ext_flags & OCFS2_EXT_UNWRITTEN) {
desc->c_unwritten = 1;
+ desc->c_needs_zero = 1;
+ }
num_clusters--;
}
@@ -1512,7 +1501,7 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
goto out;
}
- ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
ocfs2_commit_trans(osb, handle);
@@ -1557,9 +1546,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_dinode *di = NULL;
- mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
- (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
- oi->ip_dyn_features);
+ trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno,
+ len, (unsigned long long)pos,
+ oi->ip_dyn_features);
/*
* Handle inodes which already have inline data 1st.
@@ -1618,34 +1607,86 @@ out:
* write path can treat it as an non-allocating write, which has no
* special case code for sparse/nonsparse files.
*/
-static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
- unsigned len,
+static int ocfs2_expand_nonsparse_inode(struct inode *inode,
+ struct buffer_head *di_bh,
+ loff_t pos, unsigned len,
struct ocfs2_write_ctxt *wc)
{
int ret;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
loff_t newsize = pos + len;
- if (ocfs2_sparse_alloc(osb))
- return 0;
+ BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
if (newsize <= i_size_read(inode))
return 0;
- ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
+ ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
if (ret)
mlog_errno(ret);
+ wc->w_first_new_cpos =
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+
return ret;
}
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+ loff_t pos)
+{
+ int ret = 0;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+ if (pos > i_size_read(inode))
+ ret = ocfs2_zero_extend(inode, di_bh, pos);
+
+ return ret;
+}
+
+/*
+ * Try to flush the truncate log if we can free enough clusters from it.
+ * As for the return value, "< 0" means error, "0" means no space, and "1"
+ * means we have freed enough space and the caller should try to allocate again.
+ */
+static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+ unsigned int needed)
+{
+ tid_t target;
+ int ret = 0;
+ unsigned int truncated_clusters;
+
+ mutex_lock(&osb->osb_tl_inode->i_mutex);
+ truncated_clusters = osb->truncated_clusters;
+ mutex_unlock(&osb->osb_tl_inode->i_mutex);
+
+ /*
+ * Check whether we can succeed in allocating if we free
+ * the truncate log.
+ */
+ if (truncated_clusters < needed)
+ goto out;
+
+ ret = ocfs2_flush_truncate_log(osb);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
+ jbd2_log_wait_commit(osb->journal->j_journal, target);
+ ret = 1;
+ }
+out:
+ return ret;
+}
+
+int ocfs2_write_begin_nolock(struct file *filp,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page)
{
- int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
- unsigned int clusters_to_alloc, extents_to_split;
+ int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
+ unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
struct ocfs2_write_ctxt *wc;
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1654,7 +1695,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle;
struct ocfs2_extent_tree et;
+ int try_free = 1, ret1;
+try_again:
ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
if (ret) {
mlog_errno(ret);
@@ -1674,21 +1717,47 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
}
}
- ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
+ wc);
if (ret) {
mlog_errno(ret);
goto out;
}
+ ret = ocfs2_check_range_for_refcount(inode, pos, len);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ } else if (ret == 1) {
+ clusters_need = wc->w_clen;
+ ret = ocfs2_refcount_cow(inode, di_bh,
+ wc->w_cpos, wc->w_clen, UINT_MAX);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
&extents_to_split);
if (ret) {
mlog_errno(ret);
goto out;
}
+ clusters_need += clusters_to_alloc;
di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+ trace_ocfs2_write_begin_nolock(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (long long)i_size_read(inode),
+ le32_to_cpu(di->i_clusters),
+ pos, len, flags, mmap_page,
+ clusters_to_alloc, extents_to_split);
+
/*
* We set w_target_from, w_target_to here so that
* ocfs2_write_end() knows which range in the target page to
@@ -1701,13 +1770,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* ocfs2_lock_allocators(). It greatly over-estimates
* the work to be done.
*/
- mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
- " clusters_to_add = %u, extents_to_split = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
- clusters_to_alloc, extents_to_split);
-
- ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
+ wc->w_di_bh);
ret = ocfs2_lock_allocators(inode, &et,
clusters_to_alloc, extents_to_split,
&data_ac, &meta_ac);
@@ -1716,14 +1780,27 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
goto out;
}
+ if (data_ac)
+ data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
credits = ocfs2_calc_extend_credits(inode->i_sb,
- &di->id2.i_list,
- clusters_to_alloc);
+ &di->id2.i_list);
}
- ocfs2_set_target_boundaries(osb, wc, pos, len,
- clusters_to_alloc + extents_to_split);
+ /*
+ * We have to zero sparse allocated clusters, unwritten extent clusters,
+ * and non-sparse clusters we just extended. For non-sparse writes,
+ * we know zeros will only be needed in the first and/or last cluster.
+ */
+ if (clusters_to_alloc || extents_to_split ||
+ (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+ wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+ cluster_of_pages = 1;
+ else
+ cluster_of_pages = 0;
+
+ ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
@@ -1734,16 +1811,17 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
wc->w_handle = handle;
- if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
- ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
- ret = -EDQUOT;
- goto out_commit;
+ if (clusters_to_alloc) {
+ ret = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
+ if (ret)
+ goto out_commit;
}
/*
* We don't want this to fail in ocfs2_write_end(), so do it
* here.
*/
- ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1755,14 +1833,25 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* that we can zero and flush if we error after adding the
* extent.
*/
- ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
- clusters_to_alloc + extents_to_split,
- mmap_page);
- if (ret) {
+ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
+ cluster_of_pages, mmap_page);
+ if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out_quota;
}
+ /*
+ * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
+ * the target page. In this case, we exit with no error and no target
+ * page. This will trigger the caller, page_mkwrite(), to re-try
+ * the operation.
+ */
+ if (ret == -EAGAIN) {
+ BUG_ON(wc->w_target_page);
+ ret = 0;
+ goto out_quota;
+ }
+
ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
len);
if (ret) {
@@ -1781,7 +1870,7 @@ success:
return 0;
out_quota:
if (clusters_to_alloc)
- vfs_dq_free_space(inode,
+ dquot_free_space(inode,
ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
out_commit:
ocfs2_commit_trans(osb, handle);
@@ -1789,10 +1878,30 @@ out_commit:
out:
ocfs2_free_write_ctxt(wc);
- if (data_ac)
+ if (data_ac) {
ocfs2_free_alloc_context(data_ac);
- if (meta_ac)
+ data_ac = NULL;
+ }
+ if (meta_ac) {
ocfs2_free_alloc_context(meta_ac);
+ meta_ac = NULL;
+ }
+
+ if (ret == -ENOSPC && try_free) {
+ /*
+ * Try to free some truncate log so that we can have enough
+ * clusters to allocate.
+ */
+ try_free = 0;
+
+ ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
+ if (ret1 == 1)
+ goto try_again;
+
+ if (ret1 < 0)
+ mlog_errno(ret1);
+ }
+
return ret;
}
@@ -1819,7 +1928,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+ ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
@@ -1853,12 +1962,12 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
}
}
- kaddr = kmap_atomic(wc->w_target_page, KM_USER0);
+ kaddr = kmap_atomic(wc->w_target_page);
memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
- mlog(0, "Data written to inode at offset %llu. "
- "id_count = %u, copied = %u, i_dyn_features = 0x%x\n",
+ trace_ocfs2_write_end_inline(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)pos, *copied,
le16_to_cpu(di->id2.i_data.id_count),
le16_to_cpu(di->i_dyn_features));
@@ -1920,7 +2029,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
out_write_size:
pos += copied;
- if (pos > inode->i_size) {
+ if (pos > i_size_read(inode)) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
@@ -1929,6 +2038,7 @@ out_write_size:
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, wc->w_di_bh);
ocfs2_commit_trans(osb, handle);
@@ -1962,10 +2072,10 @@ const struct address_space_operations ocfs2_aops = {
.write_begin = ocfs2_write_begin,
.write_end = ocfs2_write_end,
.bmap = ocfs2_bmap,
- .sync_page = block_sync_page,
.direct_IO = ocfs2_direct_IO,
- .invalidatepage = ocfs2_invalidatepage,
+ .invalidatepage = block_invalidatepage,
.releasepage = ocfs2_releasepage,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
};
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 503e49232e1..6cae155d54d 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,8 +22,7 @@
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
- unsigned from, unsigned to);
+#include <linux/aio.h>
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
@@ -48,7 +47,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page);
@@ -57,6 +57,8 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
struct buffer_head *di_bh);
int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
/* all ocfs2_dio_end_io()'s fault */
#define ocfs2_iocb_is_rw_locked(iocb) \
test_bit(0, (unsigned long *)&iocb->private)
@@ -68,8 +70,36 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
else
clear_bit(1, (unsigned long *)&iocb->private);
}
+
+/*
+ * A named enum giving the bit number used for each lock type stored in
+ * iocb->private, which is used for communication between
+ * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
+ */
+enum ocfs2_iocb_lock_bits {
+ OCFS2_IOCB_RW_LOCK = 0,
+ OCFS2_IOCB_RW_LOCK_LEVEL,
+ OCFS2_IOCB_SEM,
+ OCFS2_IOCB_UNALIGNED_IO,
+ OCFS2_IOCB_NUM_LOCKS
+};
+
#define ocfs2_iocb_clear_rw_locked(iocb) \
- clear_bit(0, (unsigned long *)&iocb->private)
+ clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
#define ocfs2_iocb_rw_locked_level(iocb) \
- test_bit(1, (unsigned long *)&iocb->private)
+ test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_set_sem_locked(iocb) \
+ set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_sem_locked(iocb) \
+ clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_sem_locked(iocb) \
+ test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+
+#define ocfs2_iocb_set_unaligned_aio(iocb) \
+ set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_unaligned_aio(iocb) \
+ clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_unaligned_aio(iocb) \
+ test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index 2a947c44e59..0725e605465 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -22,6 +22,9 @@
#include <linux/crc32.h>
#include <linux/buffer_head.h>
#include <linux/bitops.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/fs.h>
#include <asm/byteorder.h>
#include <cluster/masklog.h>
@@ -44,7 +47,7 @@
* Calculate the bit offset in the hamming code buffer based on the bit's
* offset in the data buffer. Since the hamming code reserves all
* power-of-two bits for parity, the data bit number and the code bit
- * number are offest by all the parity bits beforehand.
+ * number are offset by all the parity bits beforehand.
*
* Recall that bit numbers in hamming code are 1-based. This function
* takes the 0-based data bit from the caller.
@@ -222,6 +225,155 @@ void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
}
+
+/*
+ * Debugfs handling.
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+static int blockcheck_u64_get(void *data, u64 *val)
+{
+ *val = *(u64 *)data;
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
+
+static struct dentry *blockcheck_debugfs_create(const char *name,
+ struct dentry *parent,
+ u64 *value)
+{
+ return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
+ &blockcheck_fops);
+}
+
+static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+ if (stats) {
+ debugfs_remove(stats->b_debug_check);
+ stats->b_debug_check = NULL;
+ debugfs_remove(stats->b_debug_failure);
+ stats->b_debug_failure = NULL;
+ debugfs_remove(stats->b_debug_recover);
+ stats->b_debug_recover = NULL;
+ debugfs_remove(stats->b_debug_dir);
+ stats->b_debug_dir = NULL;
+ }
+}
+
+static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+ struct dentry *parent)
+{
+ int rc = -EINVAL;
+
+ if (!stats)
+ goto out;
+
+ stats->b_debug_dir = debugfs_create_dir("blockcheck", parent);
+ if (!stats->b_debug_dir)
+ goto out;
+
+ stats->b_debug_check =
+ blockcheck_debugfs_create("blocks_checked",
+ stats->b_debug_dir,
+ &stats->b_check_count);
+
+ stats->b_debug_failure =
+ blockcheck_debugfs_create("checksums_failed",
+ stats->b_debug_dir,
+ &stats->b_failure_count);
+
+ stats->b_debug_recover =
+ blockcheck_debugfs_create("ecc_recoveries",
+ stats->b_debug_dir,
+ &stats->b_recover_count);
+ if (stats->b_debug_check && stats->b_debug_failure &&
+ stats->b_debug_recover)
+ rc = 0;
+
+out:
+ if (rc)
+ ocfs2_blockcheck_debug_remove(stats);
+ return rc;
+}
+#else
+static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+ struct dentry *parent)
+{
+ return 0;
+}
+
+static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
+/* Always-called wrappers for starting and stopping the debugfs files */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+ struct dentry *parent)
+{
+ return ocfs2_blockcheck_debug_install(stats, parent);
+}
+
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats)
+{
+ ocfs2_blockcheck_debug_remove(stats);
+}
+
+static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats)
+{
+ u64 new_count;
+
+ if (!stats)
+ return;
+
+ spin_lock(&stats->b_lock);
+ stats->b_check_count++;
+ new_count = stats->b_check_count;
+ spin_unlock(&stats->b_lock);
+
+ if (!new_count)
+ mlog(ML_NOTICE, "Block check count has wrapped\n");
+}
+
+static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats)
+{
+ u64 new_count;
+
+ if (!stats)
+ return;
+
+ spin_lock(&stats->b_lock);
+ stats->b_failure_count++;
+ new_count = stats->b_failure_count;
+ spin_unlock(&stats->b_lock);
+
+ if (!new_count)
+ mlog(ML_NOTICE, "Checksum failure count has wrapped\n");
+}
+
+static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats)
+{
+ u64 new_count;
+
+ if (!stats)
+ return;
+
+ spin_lock(&stats->b_lock);
+ stats->b_recover_count++;
+ new_count = stats->b_recover_count;
+ spin_unlock(&stats->b_lock);
+
+ if (!new_count)
+ mlog(ML_NOTICE, "ECC recovery count has wrapped\n");
+}
+
+
+
+/*
+ * These are the low-level APIs for using the ocfs2_block_check structure.
+ */
+
/*
* This function generates check information for a block.
* data is the block to be checked. bc is a pointer to the
@@ -251,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
* No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
* larger than 16 bits.
*/
- BUG_ON(ecc > USHORT_MAX);
+ BUG_ON(ecc > USHRT_MAX);
bc->bc_crc32e = cpu_to_le32(crc);
bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -266,43 +418,50 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
* Again, the data passed in should be the on-disk endian.
*/
int ocfs2_block_check_validate(void *data, size_t blocksize,
- struct ocfs2_block_check *bc)
+ struct ocfs2_block_check *bc,
+ struct ocfs2_blockcheck_stats *stats)
{
int rc = 0;
- struct ocfs2_block_check check;
+ u32 bc_crc32e;
+ u16 bc_ecc;
u32 crc, ecc;
- check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
- check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+ ocfs2_blockcheck_inc_check(stats);
+
+ bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+ bc_ecc = le16_to_cpu(bc->bc_ecc);
memset(bc, 0, sizeof(struct ocfs2_block_check));
/* Fast path - if the crc32 validates, we're good to go */
crc = crc32_le(~0, data, blocksize);
- if (crc == check.bc_crc32e)
+ if (crc == bc_crc32e)
goto out;
+ ocfs2_blockcheck_inc_failure(stats);
mlog(ML_ERROR,
- "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
- (unsigned int)check.bc_crc32e, (unsigned int)crc);
+ "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
+ (unsigned int)bc_crc32e, (unsigned int)crc);
/* Ok, try ECC fixups */
ecc = ocfs2_hamming_encode_block(data, blocksize);
- ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
+ ocfs2_hamming_fix_block(data, blocksize, ecc ^ bc_ecc);
/* And check the crc32 again */
crc = crc32_le(~0, data, blocksize);
- if (crc == check.bc_crc32e)
+ if (crc == bc_crc32e) {
+ ocfs2_blockcheck_inc_recover(stats);
goto out;
+ }
- mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
- (unsigned int)check.bc_crc32e, (unsigned int)crc);
+ mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
+ (unsigned int)bc_crc32e, (unsigned int)crc);
rc = -EIO;
out:
- bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
- bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+ bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+ bc->bc_ecc = cpu_to_le16(bc_ecc);
return rc;
}
@@ -350,7 +509,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
* No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
* larger than 16 bits.
*/
- BUG_ON(ecc > USHORT_MAX);
+ BUG_ON(ecc > USHRT_MAX);
bc->bc_crc32e = cpu_to_le32(crc);
bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -366,10 +525,12 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
* Again, the data passed in should be the on-disk endian.
*/
int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
- struct ocfs2_block_check *bc)
+ struct ocfs2_block_check *bc,
+ struct ocfs2_blockcheck_stats *stats)
{
int i, rc = 0;
- struct ocfs2_block_check check;
+ u32 bc_crc32e;
+ u16 bc_ecc;
u32 crc, ecc, fix;
BUG_ON(nr < 0);
@@ -377,20 +538,23 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
if (!nr)
return 0;
- check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
- check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+ ocfs2_blockcheck_inc_check(stats);
+
+ bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+ bc_ecc = le16_to_cpu(bc->bc_ecc);
memset(bc, 0, sizeof(struct ocfs2_block_check));
/* Fast path - if the crc32 validates, we're good to go */
for (i = 0, crc = ~0; i < nr; i++)
crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
- if (crc == check.bc_crc32e)
+ if (crc == bc_crc32e)
goto out;
+ ocfs2_blockcheck_inc_failure(stats);
mlog(ML_ERROR,
"CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
- (unsigned int)check.bc_crc32e, (unsigned int)crc);
+ (unsigned int)bc_crc32e, (unsigned int)crc);
/* Ok, try ECC fixups */
for (i = 0, ecc = 0; i < nr; i++) {
@@ -403,7 +567,7 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
bhs[i]->b_size * 8,
bhs[i]->b_size * 8 * i);
}
- fix = ecc ^ check.bc_ecc;
+ fix = ecc ^ bc_ecc;
for (i = 0; i < nr; i++) {
/*
* Try the fix against each buffer. It will only affect
@@ -416,17 +580,19 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
/* And check the crc32 again */
for (i = 0, crc = ~0; i < nr; i++)
crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
- if (crc == check.bc_crc32e)
+ if (crc == bc_crc32e) {
+ ocfs2_blockcheck_inc_recover(stats);
goto out;
+ }
mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
- (unsigned int)check.bc_crc32e, (unsigned int)crc);
+ (unsigned int)bc_crc32e, (unsigned int)crc);
rc = -EIO;
out:
- bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
- bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+ bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+ bc->bc_ecc = cpu_to_le16(bc_ecc);
return rc;
}
@@ -448,9 +614,11 @@ int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
struct ocfs2_block_check *bc)
{
int rc = 0;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
- if (ocfs2_meta_ecc(OCFS2_SB(sb)))
- rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
+ if (ocfs2_meta_ecc(osb))
+ rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc,
+ &osb->osb_ecc_stats);
return rc;
}
@@ -468,9 +636,11 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
struct ocfs2_block_check *bc)
{
int rc = 0;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
- if (ocfs2_meta_ecc(OCFS2_SB(sb)))
- rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
+ if (ocfs2_meta_ecc(osb))
+ rc = ocfs2_block_check_validate_bhs(bhs, nr, bc,
+ &osb->osb_ecc_stats);
return rc;
}
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
index 70ec3feda32..d4b69febf70 100644
--- a/fs/ocfs2/blockcheck.h
+++ b/fs/ocfs2/blockcheck.h
@@ -21,6 +21,24 @@
#define OCFS2_BLOCKCHECK_H
+/* Count errors and error correction from blockcheck.c */
+struct ocfs2_blockcheck_stats {
+ spinlock_t b_lock;
+ u64 b_check_count; /* Number of blocks we've checked */
+ u64 b_failure_count; /* Number of failed checksums */
+ u64 b_recover_count; /* Number of blocks fixed by ecc */
+
+ /*
+ * debugfs entries, used if this is passed to
+ * ocfs2_blockcheck_stats_debugfs_install()
+ */
+ struct dentry *b_debug_dir; /* Parent of the debugfs files */
+ struct dentry *b_debug_check; /* Exposes b_check_count */
+ struct dentry *b_debug_failure; /* Exposes b_failure_count */
+ struct dentry *b_debug_recover; /* Exposes b_recover_count */
+};
+
+
/* High level block API */
void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
struct ocfs2_block_check *bc);
@@ -37,11 +55,18 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
void ocfs2_block_check_compute(void *data, size_t blocksize,
struct ocfs2_block_check *bc);
int ocfs2_block_check_validate(void *data, size_t blocksize,
- struct ocfs2_block_check *bc);
+ struct ocfs2_block_check *bc,
+ struct ocfs2_blockcheck_stats *stats);
void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
struct ocfs2_block_check *bc);
int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
- struct ocfs2_block_check *bc);
+ struct ocfs2_block_check *bc,
+ struct ocfs2_blockcheck_stats *stats);
+
+/* Debug Initialization */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+ struct dentry *parent);
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats);
/*
* Hamming code functions
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 15c8e6deee2..1edcb141f63 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <cluster/masklog.h>
@@ -36,8 +35,8 @@
#include "inode.h"
#include "journal.h"
#include "uptodate.h"
-
#include "buffer_head_io.h"
+#include "ocfs2_trace.h"
/*
* Bits on bh->b_state used by ocfs2.
@@ -52,12 +51,11 @@ enum ocfs2_state_bits {
BUFFER_FNS(NeedsValidate, needs_validate);
int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
- struct inode *inode)
+ struct ocfs2_caching_info *ci)
{
int ret = 0;
- mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
- (unsigned long long)bh->b_blocknr, inode);
+ trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci);
BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
BUG_ON(buffer_jbd(bh));
@@ -67,10 +65,11 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
* can get modified during recovery even if read-only. */
if (ocfs2_is_hard_readonly(osb)) {
ret = -EROFS;
+ mlog_errno(ret);
goto out;
}
- mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_lock(ci);
lock_buffer(bh);
set_buffer_uptodate(bh);
@@ -85,18 +84,17 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
wait_on_buffer(bh);
if (buffer_uptodate(bh)) {
- ocfs2_set_buffer_uptodate(inode, bh);
+ ocfs2_set_buffer_uptodate(ci, bh);
} else {
/* We don't need to remove the clustered uptodate
* information for this bh as it's not marked locally
* uptodate. */
ret = -EIO;
- put_bh(bh);
+ mlog_errno(ret);
}
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_unlock(ci);
out:
- mlog_exit(ret);
return ret;
}
@@ -107,16 +105,16 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int i;
struct buffer_head *bh;
- if (!nr) {
- mlog(ML_BH_IO, "No buffers will be read!\n");
+ trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
+
+ if (!nr)
goto bail;
- }
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(osb->sb, block++);
if (bhs[i] == NULL) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -124,10 +122,8 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
bh = bhs[i];
if (buffer_jbd(bh)) {
- mlog(ML_BH_IO,
- "trying to sync read a jbd "
- "managed bh (blocknr = %llu), skipping\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_read_blocks_sync_jbd(
+ (unsigned long long)bh->b_blocknr);
continue;
}
@@ -177,7 +173,7 @@ bail:
return status;
}
-int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
struct buffer_head *bh))
@@ -185,11 +181,11 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
int status = 0;
int i, ignore_cache = 0;
struct buffer_head *bh;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
- mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n",
- inode, (unsigned long long)block, nr, flags);
+ trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
- BUG_ON(!inode);
+ BUG_ON(!ci);
BUG_ON((flags & OCFS2_BH_READAHEAD) &&
(flags & OCFS2_BH_IGNORE_CACHE));
@@ -207,18 +203,17 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
}
if (nr == 0) {
- mlog(ML_BH_IO, "No buffers will be read!\n");
status = 0;
goto bail;
}
- mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_lock(ci);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
- bhs[i] = sb_getblk(inode->i_sb, block++);
+ bhs[i] = sb_getblk(sb, block++);
if (bhs[i] == NULL) {
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
- status = -EIO;
+ ocfs2_metadata_cache_io_unlock(ci);
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -250,21 +245,19 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
* before our is-it-in-flight check.
*/
- if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) {
- mlog(ML_UPTODATE,
- "bh (%llu), inode %llu not uptodate\n",
+ if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
+ trace_ocfs2_read_blocks_from_disk(
(unsigned long long)bh->b_blocknr,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ (unsigned long long)ocfs2_metadata_cache_owner(ci));
/* We're using ignore_cache here to say
* "go to disk" */
ignore_cache = 1;
}
+ trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr,
+ ignore_cache, buffer_jbd(bh), buffer_dirty(bh));
+
if (buffer_jbd(bh)) {
- if (ignore_cache)
- mlog(ML_BH_IO, "trying to sync read a jbd "
- "managed bh (blocknr = %llu)\n",
- (unsigned long long)bh->b_blocknr);
continue;
}
@@ -272,9 +265,6 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
if (buffer_dirty(bh)) {
/* This should probably be a BUG, or
* at least return an error. */
- mlog(ML_BH_IO, "asking me to sync read a dirty "
- "buffer! (blocknr = %llu)\n",
- (unsigned long long)bh->b_blocknr);
continue;
}
@@ -283,7 +273,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
* previously submitted request than we are
* done here. */
if ((flags & OCFS2_BH_READAHEAD)
- && ocfs2_buffer_read_ahead(inode, bh))
+ && ocfs2_buffer_read_ahead(ci, bh))
continue;
lock_buffer(bh);
@@ -305,7 +295,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
* buffer lock. */
if (!(flags & OCFS2_BH_IGNORE_CACHE)
&& !(flags & OCFS2_BH_READAHEAD)
- && ocfs2_buffer_uptodate(inode, bh)) {
+ && ocfs2_buffer_uptodate(ci, bh)) {
unlock_buffer(bh);
continue;
}
@@ -327,7 +317,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
if (!(flags & OCFS2_BH_READAHEAD)) {
/* We know this can't have changed as we hold the
- * inode sem. Avoid doing any work on the bh if the
+ * owner sem. Avoid doing any work on the bh if the
* journal has it. */
if (!buffer_jbd(bh))
wait_on_buffer(bh);
@@ -351,7 +341,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
* that better not have changed */
BUG_ON(buffer_jbd(bh));
clear_buffer_needs_validate(bh);
- status = validate(inode->i_sb, bh);
+ status = validate(sb, bh);
if (status) {
put_bh(bh);
bhs[i] = NULL;
@@ -363,18 +353,15 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
/* Always set the buffer in the cache, even if it was
* a forced read, or read-ahead which hasn't yet
* completed. */
- ocfs2_set_buffer_uptodate(inode, bh);
+ ocfs2_set_buffer_uptodate(ci, bh);
}
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_unlock(ci);
- mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
- (unsigned long long)block, nr,
- ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
- flags);
+ trace_ocfs2_read_blocks_end((unsigned long long)block, nr,
+ flags, ignore_cache);
bail:
- mlog_exit(status);
return status;
}
@@ -399,21 +386,21 @@ static void ocfs2_check_super_or_backup(struct super_block *sb,
/*
* Write super block and backups doesn't need to collaborate with journal,
- * so we don't need to lock ip_io_mutex and inode doesn't need to bea passed
+ * so we don't need to lock ip_io_mutex and ci doesn't need to be passed
* into this function.
*/
int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
struct buffer_head *bh)
{
int ret = 0;
-
- mlog_entry_void();
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
BUG_ON(buffer_jbd(bh));
ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
ret = -EROFS;
+ mlog_errno(ret);
goto out;
}
@@ -425,16 +412,16 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
+ ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
submit_bh(WRITE, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ret = -EIO;
- put_bh(bh);
+ mlog_errno(ret);
}
out:
- mlog_exit(ret);
return ret;
}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index c75d682dadd..b97bcc6dde7 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -33,7 +33,7 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
int ocfs2_write_block(struct ocfs2_super *osb,
struct buffer_head *bh,
- struct inode *inode);
+ struct ocfs2_caching_info *ci);
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[]);
@@ -44,7 +44,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
* be set even for a READAHEAD call, as it marks the buffer for later
* validation.
*/
-int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
struct buffer_head *bh));
@@ -55,7 +55,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
#define OCFS2_BH_IGNORE_CACHE 1
#define OCFS2_BH_READAHEAD 8
-static inline int ocfs2_read_block(struct inode *inode, u64 off,
+static inline int ocfs2_read_block(struct ocfs2_caching_info *ci, u64 off,
struct buffer_head **bh,
int (*validate)(struct super_block *sb,
struct buffer_head *bh))
@@ -68,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
goto bail;
}
- status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
+ status = ocfs2_read_blocks(ci, off, 1, bh, 0, validate);
bail:
return status;
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index bc8c5e7d860..1aefc0350ec 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
- quorum.o tcp.o netdebug.o ver.o
+ quorum.o tcp.o netdebug.o
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 09cc25d0461..73039295d0d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,8 @@
#include <linux/crc32.h>
#include <linux/time.h>
#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
#include "heartbeat.h"
#include "tcp.h"
@@ -61,10 +63,53 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
static LIST_HEAD(o2hb_node_events);
static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
+/*
+ * In global heartbeat, we maintain a series of region bitmaps.
+ * - o2hb_region_bitmap allows us to limit the region number to max region.
+ * - o2hb_live_region_bitmap tracks live regions (seen steady iterations).
+ * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
+ * heartbeat on it.
+ * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
+ */
+static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+
+#define O2HB_DB_TYPE_LIVENODES 0
+#define O2HB_DB_TYPE_LIVEREGIONS 1
+#define O2HB_DB_TYPE_QUORUMREGIONS 2
+#define O2HB_DB_TYPE_FAILEDREGIONS 3
+#define O2HB_DB_TYPE_REGION_LIVENODES 4
+#define O2HB_DB_TYPE_REGION_NUMBER 5
+#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
+#define O2HB_DB_TYPE_REGION_PINNED 7
+struct o2hb_debug_buf {
+ int db_type;
+ int db_size;
+ int db_len;
+ void *db_data;
+};
+
+static struct o2hb_debug_buf *o2hb_db_livenodes;
+static struct o2hb_debug_buf *o2hb_db_liveregions;
+static struct o2hb_debug_buf *o2hb_db_quorumregions;
+static struct o2hb_debug_buf *o2hb_db_failedregions;
+
#define O2HB_DEBUG_DIR "o2hb"
#define O2HB_DEBUG_LIVENODES "livenodes"
+#define O2HB_DEBUG_LIVEREGIONS "live_regions"
+#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions"
+#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
+#define O2HB_DEBUG_REGION_NUMBER "num"
+#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
+#define O2HB_DEBUG_REGION_PINNED "pinned"
+
static struct dentry *o2hb_debug_dir;
static struct dentry *o2hb_debug_livenodes;
+static struct dentry *o2hb_debug_liveregions;
+static struct dentry *o2hb_debug_quorumregions;
+static struct dentry *o2hb_debug_failedregions;
static LIST_HEAD(o2hb_all_regions);
@@ -76,9 +121,48 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
#define O2HB_DEFAULT_BLOCK_BITS 9
+enum o2hb_heartbeat_modes {
+ O2HB_HEARTBEAT_LOCAL = 0,
+ O2HB_HEARTBEAT_GLOBAL,
+ O2HB_HEARTBEAT_NUM_MODES,
+};
+
+char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
+ "local", /* O2HB_HEARTBEAT_LOCAL */
+ "global", /* O2HB_HEARTBEAT_GLOBAL */
+};
+
unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
+unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
+
+/*
+ * o2hb_dependent_users tracks the number of registered callbacks that depend
+ * on heartbeat. o2net and o2dlm are two entities that register this callback.
+ * However only o2dlm depends on the heartbeat. It does not want the heartbeat
+ * to stop while a dlm domain is still active.
+ */
+unsigned int o2hb_dependent_users;
+
+/*
+ * In global heartbeat mode, all regions are pinned if there are one or more
+ * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
+ * regions are unpinned if the region count exceeds the cut off or the number
+ * of dependent users falls to zero.
+ */
+#define O2HB_PIN_CUT_OFF 3
-/* Only sets a new threshold if there are no active regions.
+/*
+ * In local heartbeat mode, we assume the dlm domain name to be the same as
+ * region uuid. This is true for domains created for the file system but not
+ * necessarily true for userdlm domains. This is a known limitation.
+ *
+ * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
+ * works for both file system and userdlm domains.
+ */
+static int o2hb_region_pin(const char *region_uuid);
+static void o2hb_region_unpin(const char *region_uuid);
+
+/* Only sets a new threshold if there are no active regions.
*
* No locking or otherwise interesting code is required for reading
* o2hb_dead_threshold as it can't change once regions are active and
@@ -93,6 +177,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
}
}
+static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
+{
+ int ret = -1;
+
+ if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
+ spin_lock(&o2hb_live_lock);
+ if (list_empty(&o2hb_all_regions)) {
+ o2hb_heartbeat_mode = hb_mode;
+ ret = 0;
+ }
+ spin_unlock(&o2hb_live_lock);
+ }
+
+ return ret;
+}
+
struct o2hb_node_event {
struct list_head hn_item;
enum o2hb_callback_type hn_event_type;
@@ -116,7 +216,10 @@ struct o2hb_region {
struct config_item hr_item;
struct list_head hr_all_item;
- unsigned hr_unclean_stop:1;
+ unsigned hr_unclean_stop:1,
+ hr_aborted_start:1,
+ hr_item_pinned:1,
+ hr_item_dropped:1;
/* protected by the hr_callback_sem */
struct task_struct *hr_task;
@@ -134,11 +237,29 @@ struct o2hb_region {
struct block_device *hr_bdev;
struct o2hb_disk_slot *hr_slots;
+ /* live node map of this region */
+ unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned int hr_region_num;
+
+ struct dentry *hr_debug_dir;
+ struct dentry *hr_debug_livenodes;
+ struct dentry *hr_debug_regnum;
+ struct dentry *hr_debug_elapsed_time;
+ struct dentry *hr_debug_pinned;
+ struct o2hb_debug_buf *hr_db_livenodes;
+ struct o2hb_debug_buf *hr_db_regnum;
+ struct o2hb_debug_buf *hr_db_elapsed_time;
+ struct o2hb_debug_buf *hr_db_pinned;
+
/* let the person setting up hb wait for it to return until it
* has reached a 'steady' state. This will be fixed when we have
* a more complete api that doesn't lead to this sort of fragility. */
atomic_t hr_steady_iterations;
+ /* terminate o2hb thread if it does not reach steady state
+ * (hr_steady_iterations == 0) within hr_unsteady_iterations */
+ atomic_t hr_unsteady_iterations;
+
char hr_dev_name[BDEVNAME_SIZE];
unsigned int hr_timeout_ms;
@@ -164,20 +285,54 @@ struct o2hb_bio_wait_ctxt {
static void o2hb_write_timeout(struct work_struct *work)
{
+ int failed, quorum;
+ unsigned long flags;
struct o2hb_region *reg =
container_of(work, struct o2hb_region,
hr_write_timeout_work.work);
mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
"milliseconds\n", reg->hr_dev_name,
- jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
+ jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
+
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock_irqsave(&o2hb_live_lock, flags);
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+ failed = bitmap_weight(o2hb_failed_region_bitmap,
+ O2NM_MAX_REGIONS);
+ quorum = bitmap_weight(o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS);
+ spin_unlock_irqrestore(&o2hb_live_lock, flags);
+
+ mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
+ quorum, failed);
+
+ /*
+ * Fence if the number of failed regions >= half the number
+ * of quorum regions
+ */
+ if ((failed << 1) < quorum)
+ return;
+ }
+
o2quo_disk_timeout();
}
static void o2hb_arm_write_timeout(struct o2hb_region *reg)
{
- mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS);
+ /* Arm the write timeout only after the thread reaches steady state */
+ if (atomic_read(&reg->hr_steady_iterations) != 0)
+ return;
+
+ mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
+ O2HB_MAX_WRITE_TIMEOUT_MS);
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ }
cancel_delayed_work(&reg->hr_write_timeout_work);
reg->hr_last_timeout_start = jiffies;
schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -186,8 +341,7 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
{
- cancel_delayed_work(&reg->hr_write_timeout_work);
- flush_scheduled_work();
+ cancel_delayed_work_sync(&reg->hr_write_timeout_work);
}
static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -214,11 +368,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
static void o2hb_wait_on_io(struct o2hb_region *reg,
struct o2hb_bio_wait_ctxt *wc)
{
- struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
-
- blk_run_address_space(mapping);
o2hb_bio_wait_dec(wc, 1);
-
wait_for_completion(&wc->wc_io_complete);
}
@@ -263,7 +413,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
}
/* Must put everything in 512 byte sectors for the bio... */
- bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9);
+ bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
bio->bi_bdev = reg->hr_bdev;
bio->bi_private = wc;
bio->bi_end_io = o2hb_bio_end_io;
@@ -342,7 +492,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
}
atomic_inc(&write_wc->wc_num_reqs);
- submit_bio(WRITE, bio);
+ submit_bio(WRITE_SYNC, bio);
status = 0;
bail:
@@ -388,27 +538,50 @@ static int o2hb_verify_crc(struct o2hb_region *reg,
return read == computed;
}
-/* We want to make sure that nobody is heartbeating on top of us --
- * this will help detect an invalid configuration. */
-static int o2hb_check_last_timestamp(struct o2hb_region *reg)
+/*
+ * Compare the slot data with what we wrote in the last iteration.
+ * If the match fails, print an appropriate error message. This is to
+ * detect errors like... another node heartbeating on the same slot,
+ * flaky device that is losing writes, etc.
+ * Returns 1 if check succeeds, 0 otherwise (also 0 on our 1st timestamp).
+ */
+static int o2hb_check_own_slot(struct o2hb_region *reg)
{
- int node_num, ret;
struct o2hb_disk_slot *slot;
struct o2hb_disk_heartbeat_block *hb_block;
+ char *errstr;
- node_num = o2nm_this_node();
-
- ret = 1;
- slot = &reg->hr_slots[node_num];
+ slot = &reg->hr_slots[o2nm_this_node()];
/* Don't check on our 1st timestamp */
- if (slot->ds_last_time) {
- hb_block = slot->ds_raw_block;
+ if (!slot->ds_last_time)
+ return 0;
- if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time)
- ret = 0;
- }
+ hb_block = slot->ds_raw_block;
+ if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
+ le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
+ hb_block->hb_node == slot->ds_node_num)
+ return 1;
+
+#define ERRSTR1 "Another node is heartbeating on device"
+#define ERRSTR2 "Heartbeat generation mismatch on device"
+#define ERRSTR3 "Heartbeat sequence mismatch on device"
+
+ if (hb_block->hb_node != slot->ds_node_num)
+ errstr = ERRSTR1;
+ else if (le64_to_cpu(hb_block->hb_generation) !=
+ slot->ds_last_generation)
+ errstr = ERRSTR2;
+ else
+ errstr = ERRSTR3;
- return ret;
+ mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), "
+ "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name,
+ slot->ds_node_num, (unsigned long long)slot->ds_last_generation,
+ (unsigned long long)slot->ds_last_time, hb_block->hb_node,
+ (unsigned long long)le64_to_cpu(hb_block->hb_generation),
+ (unsigned long long)le64_to_cpu(hb_block->hb_seq));
+
+ return 0;
}
static inline void o2hb_prepare_block(struct o2hb_region *reg,
@@ -447,11 +620,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
struct o2nm_node *node,
int idx)
{
- struct list_head *iter;
struct o2hb_callback_func *f;
- list_for_each(iter, &hbcall->list) {
- f = list_entry(iter, struct o2hb_callback_func, hc_item);
+ list_for_each_entry(f, &hbcall->list, hc_item) {
mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
(f->hc_func)(node, idx, f->hc_data);
}
@@ -460,16 +631,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
/* Will run the list in order until we process the passed event */
static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
{
- int empty;
struct o2hb_callback *hbcall;
struct o2hb_node_event *event;
- spin_lock(&o2hb_live_lock);
- empty = list_empty(&queued_event->hn_item);
- spin_unlock(&o2hb_live_lock);
- if (empty)
- return;
-
/* Holding callback sem assures we don't alter the callback
* lists when doing this, and serializes ourselves with other
* processes wanting callbacks. */
@@ -511,6 +675,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
{
assert_spin_locked(&o2hb_live_lock);
+ BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
+
event->hn_event_type = type;
event->hn_node = node;
event->hn_node_num = node_num;
@@ -526,6 +692,7 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
struct o2hb_node_event event =
{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
struct o2nm_node *node;
+ int queued = 0;
node = o2nm_get_node_by_num(slot->ds_node_num);
if (!node)
@@ -543,15 +710,60 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
slot->ds_node_num);
+ queued = 1;
}
}
spin_unlock(&o2hb_live_lock);
- o2hb_run_event_list(&event);
+ if (queued)
+ o2hb_run_event_list(&event);
o2nm_node_put(node);
}
+static void o2hb_set_quorum_device(struct o2hb_region *reg)
+{
+ if (!o2hb_global_heartbeat_active())
+ return;
+
+ /* Prevent race with o2hb_heartbeat_group_drop_item() */
+ if (kthread_should_stop())
+ return;
+
+ /* Tag region as quorum only after thread reaches steady state */
+ if (atomic_read(&reg->hr_steady_iterations) != 0)
+ return;
+
+ spin_lock(&o2hb_live_lock);
+
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ goto unlock;
+
+ /*
+ * A region can be added to the quorum only when it sees all
+ * live nodes heartbeat on it. In other words, the region has been
+ * added to all nodes.
+ */
+ if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+ sizeof(o2hb_live_node_bitmap)))
+ goto unlock;
+
+ printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+
+ set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+
+ /*
+ * Global heartbeat is active here; unpin all regions if the
+ * quorum region count exceeds O2HB_PIN_CUT_OFF
+ */
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
+ o2hb_region_unpin(NULL);
+unlock:
+ spin_unlock(&o2hb_live_lock);
+}
+
static int o2hb_check_slot(struct o2hb_region *reg,
struct o2hb_disk_slot *slot)
{
@@ -563,14 +775,23 @@ static int o2hb_check_slot(struct o2hb_region *reg,
u64 cputime;
unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
unsigned int slot_dead_ms;
+ int tmp;
+ int queued = 0;
memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
- /* Is this correct? Do we assume that the node doesn't exist
- * if we're not configured for him? */
+ /*
+ * If a node is no longer configured but is still in the livemap, we
+ * may need to clear that bit from the livemap.
+ */
node = o2nm_get_node_by_num(slot->ds_node_num);
- if (!node)
- return 0;
+ if (!node) {
+ spin_lock(&o2hb_live_lock);
+ tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ if (!tmp)
+ return 0;
+ }
if (!o2hb_verify_crc(reg, hb_block)) {
/* all paths from here will drop o2hb_live_lock for
@@ -623,7 +844,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
"seq %llu last %llu changed %u equal %u\n",
slot->ds_node_num, (long long)slot->ds_last_generation,
le32_to_cpu(hb_block->hb_cksum),
- (unsigned long long)le64_to_cpu(hb_block->hb_seq),
+ (unsigned long long)le64_to_cpu(hb_block->hb_seq),
(unsigned long long)slot->ds_last_time, slot->ds_changed_samples,
slot->ds_equal_samples);
@@ -637,14 +858,19 @@ fire_callbacks:
mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
slot->ds_node_num, (long long)slot->ds_last_generation);
+ set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
/* first on the list generates a callback */
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+ mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
+ "bitmap\n", slot->ds_node_num);
set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
slot->ds_node_num);
changed = 1;
+ queued = 1;
}
list_add_tail(&slot->ds_live_item,
@@ -682,15 +908,21 @@ fire_callbacks:
mlog(ML_HEARTBEAT, "Node %d left my region\n",
slot->ds_node_num);
+ clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
/* last off the live_slot generates a callback */
list_del_init(&slot->ds_live_item);
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+ mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
+ "nodes bitmap\n", slot->ds_node_num);
clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
- o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
- slot->ds_node_num);
+ /* node can be null */
+ o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
+ node, slot->ds_node_num);
changed = 1;
+ queued = 1;
}
/* We don't clear this because the node is still
@@ -706,48 +938,50 @@ fire_callbacks:
out:
spin_unlock(&o2hb_live_lock);
- o2hb_run_event_list(&event);
+ if (queued)
+ o2hb_run_event_list(&event);
- o2nm_node_put(node);
+ if (node)
+ o2nm_node_put(node);
return changed;
}
-/* This could be faster if we just implmented a find_last_bit, but I
- * don't think the circumstances warrant it. */
-static int o2hb_highest_node(unsigned long *nodes,
- int numbits)
+/*
+ * Return the index of the highest bit set in @nodes, considering the
+ * first @numbits bits; thin wrapper around find_last_bit().
+ * NOTE(review): the caller treats a result >= numbits as "no bit set" —
+ * presumably what find_last_bit() returns for an empty map; confirm.
+ */
+static int o2hb_highest_node(unsigned long *nodes, int numbits)
{
- int highest, node;
-
- highest = numbits;
- node = -1;
- while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
- if (node >= numbits)
- break;
-
- highest = node;
- }
-
- return highest;
+ return find_last_bit(nodes, numbits);
}
static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
{
- int i, ret, highest_node, change = 0;
+ int i, ret, highest_node;
+ int membership_change = 0, own_slot_ok = 0;
unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct o2hb_bio_wait_ctxt write_wc;
ret = o2nm_configured_node_map(configured_nodes,
sizeof(configured_nodes));
if (ret) {
mlog_errno(ret);
- return ret;
+ goto bail;
+ }
+
+ /*
+ * If a node is not configured but is in the livemap, we still need
+ * to read the slot so as to be able to remove it from the livemap.
+ */
+ o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ i = -1;
+ while ((i = find_next_bit(live_node_bitmap,
+ O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+ set_bit(i, configured_nodes);
}
highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
if (highest_node >= O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
- return -EINVAL;
+ mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
+ ret = -EINVAL;
+ goto bail;
}
/* No sense in reading the slots of nodes that don't exist
@@ -757,31 +991,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
ret = o2hb_read_slots(reg, highest_node + 1);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
/* With an up to date view of the slots, we can check that no
* other node has been improperly configured to heartbeat in
* our slot. */
- if (!o2hb_check_last_timestamp(reg))
- mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
- "in our slot!\n", reg->hr_dev_name);
+ own_slot_ok = o2hb_check_own_slot(reg);
/* fill in the proper info for our next heartbeat */
o2hb_prepare_block(reg, reg->hr_generation);
- /* And fire off the write. Note that we don't wait on this I/O
- * until later. */
ret = o2hb_issue_node_write(reg, &write_wc);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
i = -1;
- while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
-
- change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
+ while((i = find_next_bit(configured_nodes,
+ O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+ membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
}
/*
@@ -796,18 +1026,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* disk */
mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
write_wc.wc_error, reg->hr_dev_name);
- return write_wc.wc_error;
+ ret = write_wc.wc_error;
+ goto bail;
}
- o2hb_arm_write_timeout(reg);
+ /* Skip disarming the timeout if own slot has stale/bad data */
+ if (own_slot_ok) {
+ o2hb_set_quorum_device(reg);
+ o2hb_arm_write_timeout(reg);
+ }
+bail:
/* let the person who launched us know when things are steady */
- if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
- if (atomic_dec_and_test(&reg->hr_steady_iterations))
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ if (!ret && own_slot_ok && !membership_change) {
+ if (atomic_dec_and_test(&reg->hr_steady_iterations))
+ wake_up(&o2hb_steady_queue);
+ }
+ }
+
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
+ printk(KERN_NOTICE "o2hb: Unable to stabilize "
+ "heartbeart on region %s (%s)\n",
+ config_item_name(&reg->hr_item),
+ reg->hr_dev_name);
+ atomic_set(&reg->hr_steady_iterations, 0);
+ reg->hr_aborted_start = 1;
wake_up(&o2hb_steady_queue);
+ ret = -EIO;
+ }
}
- return 0;
+ return ret;
}
/* Subtract b from a, storing the result in a. a *must* have a larger
@@ -856,9 +1107,13 @@ static int o2hb_thread(void *data)
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
- set_user_nice(current, -20);
+ set_user_nice(current, MIN_NICE);
- while (!kthread_should_stop() && !reg->hr_unclean_stop) {
+ /* Pin node */
+ o2nm_depend_this_node();
+
+ while (!kthread_should_stop() &&
+ !reg->hr_unclean_stop && !reg->hr_aborted_start) {
/* We track the time spent inside
* o2hb_do_disk_heartbeat so that we avoid more than
* hr_timeout_ms between disk writes. On busy systems
@@ -866,20 +1121,19 @@ static int o2hb_thread(void *data)
* likely to time itself out. */
do_gettimeofday(&before_hb);
- i = 0;
- do {
- ret = o2hb_do_disk_heartbeat(reg);
- } while (ret && ++i < 2);
+ ret = o2hb_do_disk_heartbeat(reg);
do_gettimeofday(&after_hb);
elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
- mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
+ mlog(ML_HEARTBEAT,
+ "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
elapsed_msec);
- if (elapsed_msec < reg->hr_timeout_ms) {
+ if (!kthread_should_stop() &&
+ elapsed_msec < reg->hr_timeout_ms) {
/* the kthread api has blocked signals for us so no
* need to record the return value. */
msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
@@ -896,17 +1150,20 @@ static int o2hb_thread(void *data)
* to timeout on this region when we could just as easily
* write a clear generation - thus indicating to them that
* this node has left this region.
- *
- * XXX: Should we skip this on unclean_stop? */
- o2hb_prepare_block(reg, 0);
- ret = o2hb_issue_node_write(reg, &write_wc);
- if (ret == 0) {
- o2hb_wait_on_io(reg, &write_wc);
- } else {
- mlog_errno(ret);
+ */
+ if (!reg->hr_unclean_stop && !reg->hr_aborted_start) {
+ o2hb_prepare_block(reg, 0);
+ ret = o2hb_issue_node_write(reg, &write_wc);
+ if (ret == 0)
+ o2hb_wait_on_io(reg, &write_wc);
+ else
+ mlog_errno(ret);
}
- mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
+ /* Unpin node */
+ o2nm_undepend_this_node();
+
+ mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n");
return 0;
}
@@ -914,21 +1171,68 @@ static int o2hb_thread(void *data)
#ifdef CONFIG_DEBUG_FS
static int o2hb_debug_open(struct inode *inode, struct file *file)
{
+ struct o2hb_debug_buf *db = inode->i_private;
+ struct o2hb_region *reg;
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long lts;
char *buf = NULL;
int i = -1;
int out = 0;
+ /* max_nodes should be the largest bitmap we pass here */
+ BUG_ON(sizeof(map) < db->db_size);
+
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!buf)
goto bail;
- o2hb_fill_node_map(map, sizeof(map));
+ switch (db->db_type) {
+ case O2HB_DB_TYPE_LIVENODES:
+ case O2HB_DB_TYPE_LIVEREGIONS:
+ case O2HB_DB_TYPE_QUORUMREGIONS:
+ case O2HB_DB_TYPE_FAILEDREGIONS:
+ spin_lock(&o2hb_live_lock);
+ memcpy(map, db->db_data, db->db_size);
+ spin_unlock(&o2hb_live_lock);
+ break;
- while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+ case O2HB_DB_TYPE_REGION_LIVENODES:
+ spin_lock(&o2hb_live_lock);
+ reg = (struct o2hb_region *)db->db_data;
+ memcpy(map, reg->hr_live_node_bitmap, db->db_size);
+ spin_unlock(&o2hb_live_lock);
+ break;
+
+ case O2HB_DB_TYPE_REGION_NUMBER:
+ reg = (struct o2hb_region *)db->db_data;
+ out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
+ reg->hr_region_num);
+ goto done;
+
+ case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
+ reg = (struct o2hb_region *)db->db_data;
+ lts = reg->hr_last_timeout_start;
+ /* If 0, it has never been set before */
+ if (lts)
+ lts = jiffies_to_msecs(jiffies - lts);
+ out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
+ goto done;
+
+ case O2HB_DB_TYPE_REGION_PINNED:
+ reg = (struct o2hb_region *)db->db_data;
+ out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+ !!reg->hr_item_pinned);
+ goto done;
+
+ default:
+ goto done;
+ }
+
+ while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+done:
i_size_write(inode, out);
file->private_data = buf;
@@ -966,7 +1270,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
}
#endif /* CONFIG_DEBUG_FS */
-static struct file_operations o2hb_debug_fops = {
+static const struct file_operations o2hb_debug_fops = {
.open = o2hb_debug_open,
.release = o2hb_debug_release,
.read = o2hb_debug_read,
@@ -975,10 +1279,104 @@ static struct file_operations o2hb_debug_fops = {
void o2hb_exit(void)
{
- if (o2hb_debug_livenodes)
- debugfs_remove(o2hb_debug_livenodes);
- if (o2hb_debug_dir)
- debugfs_remove(o2hb_debug_dir);
+ kfree(o2hb_db_livenodes);
+ kfree(o2hb_db_liveregions);
+ kfree(o2hb_db_quorumregions);
+ kfree(o2hb_db_failedregions);
+ debugfs_remove(o2hb_debug_failedregions);
+ debugfs_remove(o2hb_debug_quorumregions);
+ debugfs_remove(o2hb_debug_liveregions);
+ debugfs_remove(o2hb_debug_livenodes);
+ debugfs_remove(o2hb_debug_dir);
+}
+
+/*
+ * Allocate a debug descriptor of @db_len bytes, store it in *@db, fill in
+ * its type, size, length and data pointer, and create the debugfs file
+ * @name under @dir with the descriptor as its private data.
+ *
+ * Returns the dentry from debugfs_create_file(), or NULL when the
+ * descriptor cannot be allocated.  The descriptor stays in *@db either
+ * way; this function never frees it.
+ */
+static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
+					struct o2hb_debug_buf **db, int db_len,
+					int type, int size, int len, void *data)
+{
+	struct o2hb_debug_buf *buf = kmalloc(db_len, GFP_KERNEL);
+
+	*db = buf;
+	if (buf == NULL)
+		return NULL;
+
+	buf->db_type = type;
+	buf->db_size = size;
+	buf->db_len = len;
+	buf->db_data = data;
+
+	return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, buf,
+				   &o2hb_debug_fops);
+}
+
+/*
+ * Create the top-level o2hb debugfs directory and the four global bitmap
+ * files (live nodes, live regions, quorum regions, failed regions).
+ *
+ * Returns 0 on success.  If any entry cannot be created the error is
+ * logged, o2hb_exit() is called to undo whatever was set up, and -ENOMEM
+ * is returned.
+ */
+static int o2hb_debug_init(void)
+{
+	int ret = -ENOMEM;
+
+	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+	if (!o2hb_debug_dir)
+		goto fail;
+
+	o2hb_debug_livenodes =
+		o2hb_debug_create(O2HB_DEBUG_LIVENODES, o2hb_debug_dir,
+				  &o2hb_db_livenodes,
+				  sizeof(*o2hb_db_livenodes),
+				  O2HB_DB_TYPE_LIVENODES,
+				  sizeof(o2hb_live_node_bitmap),
+				  O2NM_MAX_NODES, o2hb_live_node_bitmap);
+	if (!o2hb_debug_livenodes)
+		goto fail;
+
+	o2hb_debug_liveregions =
+		o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, o2hb_debug_dir,
+				  &o2hb_db_liveregions,
+				  sizeof(*o2hb_db_liveregions),
+				  O2HB_DB_TYPE_LIVEREGIONS,
+				  sizeof(o2hb_live_region_bitmap),
+				  O2NM_MAX_REGIONS, o2hb_live_region_bitmap);
+	if (!o2hb_debug_liveregions)
+		goto fail;
+
+	o2hb_debug_quorumregions =
+		o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, o2hb_debug_dir,
+				  &o2hb_db_quorumregions,
+				  sizeof(*o2hb_db_quorumregions),
+				  O2HB_DB_TYPE_QUORUMREGIONS,
+				  sizeof(o2hb_quorum_region_bitmap),
+				  O2NM_MAX_REGIONS, o2hb_quorum_region_bitmap);
+	if (!o2hb_debug_quorumregions)
+		goto fail;
+
+	o2hb_debug_failedregions =
+		o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, o2hb_debug_dir,
+				  &o2hb_db_failedregions,
+				  sizeof(*o2hb_db_failedregions),
+				  O2HB_DB_TYPE_FAILEDREGIONS,
+				  sizeof(o2hb_failed_region_bitmap),
+				  O2NM_MAX_REGIONS, o2hb_failed_region_bitmap);
+	if (!o2hb_debug_failedregions)
+		goto fail;
+
+	return 0;
+
+fail:
+	/* Log the failure, then tear down everything created so far. */
+	mlog_errno(ret);
+	o2hb_exit();
+	return ret;
}
int o2hb_init(void)
@@ -994,24 +1392,14 @@ int o2hb_init(void)
INIT_LIST_HEAD(&o2hb_node_events);
memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+ memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
+ memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
+ memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
+ memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
- o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
- if (!o2hb_debug_dir) {
- mlog_errno(-ENOMEM);
- return -ENOMEM;
- }
-
- o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
- S_IFREG|S_IRUSR,
- o2hb_debug_dir, NULL,
- &o2hb_debug_fops);
- if (!o2hb_debug_livenodes) {
- mlog_errno(-ENOMEM);
- debugfs_remove(o2hb_debug_dir);
- return -ENOMEM;
- }
+ o2hb_dependent_users = 0;
- return 0;
+ return o2hb_debug_init();
}
/* if we're already in a callback then we're already serialized by the sem */
@@ -1057,8 +1445,9 @@ static void o2hb_region_release(struct config_item *item)
struct page *page;
struct o2hb_region *reg = to_o2hb_region(item);
- if (reg->hr_tmp_block)
- kfree(reg->hr_tmp_block);
+ mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
+
+ kfree(reg->hr_tmp_block);
if (reg->hr_slot_data) {
for (i = 0; i < reg->hr_num_pages; i++) {
@@ -1072,8 +1461,15 @@ static void o2hb_region_release(struct config_item *item)
if (reg->hr_bdev)
blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
- if (reg->hr_slots)
- kfree(reg->hr_slots);
+ kfree(reg->hr_slots);
+
+ kfree(reg->hr_db_regnum);
+ kfree(reg->hr_db_livenodes);
+ debugfs_remove(reg->hr_debug_livenodes);
+ debugfs_remove(reg->hr_debug_regnum);
+ debugfs_remove(reg->hr_debug_elapsed_time);
+ debugfs_remove(reg->hr_debug_pinned);
+ debugfs_remove(reg->hr_debug_dir);
spin_lock(&o2hb_live_lock);
list_del(&reg->hr_all_item);
@@ -1291,8 +1687,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
struct o2hb_disk_slot *slot;
struct o2hb_disk_heartbeat_block *hb_block;
- mlog_entry_void();
-
ret = o2hb_read_slots(reg, reg->hr_blocks);
if (ret) {
mlog_errno(ret);
@@ -1314,7 +1708,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
}
out:
- mlog_exit(ret);
return ret;
}
@@ -1327,9 +1720,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
long fd;
int sectsize;
char *p = (char *)page;
- struct file *filp = NULL;
- struct inode *inode = NULL;
+ struct fd f;
+ struct inode *inode;
ssize_t ret = -EINVAL;
+ int live_threshold;
if (reg->hr_bdev)
goto out;
@@ -1346,26 +1740,26 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
if (fd < 0 || fd >= INT_MAX)
goto out;
- filp = fget(fd);
- if (filp == NULL)
+ f = fdget(fd);
+ if (f.file == NULL)
goto out;
if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
reg->hr_block_bytes == 0)
- goto out;
+ goto out2;
- inode = igrab(filp->f_mapping->host);
+ inode = igrab(f.file->f_mapping->host);
if (inode == NULL)
- goto out;
+ goto out2;
if (!S_ISBLK(inode->i_mode))
- goto out;
+ goto out3;
- reg->hr_bdev = I_BDEV(filp->f_mapping->host);
- ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ);
+ reg->hr_bdev = I_BDEV(f.file->f_mapping->host);
+ ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
if (ret) {
reg->hr_bdev = NULL;
- goto out;
+ goto out3;
}
inode = NULL;
@@ -1377,7 +1771,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
"blocksize %u incorrect for device, expected %d",
reg->hr_block_bytes, sectsize);
ret = -EINVAL;
- goto out;
+ goto out3;
}
o2hb_init_region_params(reg);
@@ -1391,13 +1785,13 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
ret = o2hb_map_slot_data(reg);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out3;
}
ret = o2hb_populate_slot_data(reg);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out3;
}
INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
@@ -1406,15 +1800,28 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
* A node is considered live after it has beat LIVE_THRESHOLD
* times. We're not steady until we've given them a chance
* _after_ our first read.
+ * The default threshold is bare minimum so as to limit the delay
+ * during mounts. For global heartbeat, the threshold doubled for the
+ * first region.
*/
- atomic_set(&reg->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1);
+ live_threshold = O2HB_LIVE_THRESHOLD;
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
+ live_threshold <<= 1;
+ spin_unlock(&o2hb_live_lock);
+ }
+ ++live_threshold;
+ atomic_set(&reg->hr_steady_iterations, live_threshold);
+ /* unsteady_iterations is double the steady_iterations */
+ atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
reg->hr_item.ci_name);
if (IS_ERR(hb_task)) {
ret = PTR_ERR(hb_task);
mlog_errno(ret);
- goto out;
+ goto out3;
}
spin_lock(&o2hb_live_lock);
@@ -1424,20 +1831,20 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
ret = wait_event_interruptible(o2hb_steady_queue,
atomic_read(&reg->hr_steady_iterations) == 0);
if (ret) {
- /* We got interrupted (hello ptrace!). Clean up */
- spin_lock(&o2hb_live_lock);
- hb_task = reg->hr_task;
- reg->hr_task = NULL;
- spin_unlock(&o2hb_live_lock);
+ atomic_set(&reg->hr_steady_iterations, 0);
+ reg->hr_aborted_start = 1;
+ }
- if (hb_task)
- kthread_stop(hb_task);
- goto out;
+ if (reg->hr_aborted_start) {
+ ret = -EIO;
+ goto out3;
}
/* Ok, we were woken. Make sure it wasn't by drop_item() */
spin_lock(&o2hb_live_lock);
hb_task = reg->hr_task;
+ if (o2hb_global_heartbeat_active())
+ set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
spin_unlock(&o2hb_live_lock);
if (hb_task)
@@ -1445,11 +1852,15 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
else
ret = -EIO;
+ if (hb_task && o2hb_global_heartbeat_active())
+ printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+
+out3:
+ iput(inode);
+out2:
+ fdput(f);
out:
- if (filp)
- fput(filp);
- if (inode)
- iput(inode);
if (ret < 0) {
if (reg->hr_bdev) {
blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
@@ -1583,22 +1994,113 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
: NULL;
}
+/*
+ * Create the debugfs directory for @reg under @dir, named after the
+ * region's config item, plus its four files: live nodes, region number,
+ * elapsed time and pinned state.
+ *
+ * Returns 0 on success or -ENOMEM as soon as one entry cannot be created.
+ * Entries created before the failure are not removed here.
+ */
+static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+{
+	int ret = -ENOMEM;
+
+	reg->hr_debug_dir =
+		debugfs_create_dir(config_item_name(&reg->hr_item), dir);
+	if (!reg->hr_debug_dir)
+		goto fail;
+
+	reg->hr_debug_livenodes =
+		o2hb_debug_create(O2HB_DEBUG_LIVENODES, reg->hr_debug_dir,
+				  &(reg->hr_db_livenodes),
+				  sizeof(*(reg->hr_db_livenodes)),
+				  O2HB_DB_TYPE_REGION_LIVENODES,
+				  sizeof(reg->hr_live_node_bitmap),
+				  O2NM_MAX_NODES, reg);
+	if (!reg->hr_debug_livenodes)
+		goto fail;
+
+	reg->hr_debug_regnum =
+		o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, reg->hr_debug_dir,
+				  &(reg->hr_db_regnum),
+				  sizeof(*(reg->hr_db_regnum)),
+				  O2HB_DB_TYPE_REGION_NUMBER,
+				  0, O2NM_MAX_NODES, reg);
+	if (!reg->hr_debug_regnum)
+		goto fail;
+
+	reg->hr_debug_elapsed_time =
+		o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
+				  reg->hr_debug_dir,
+				  &(reg->hr_db_elapsed_time),
+				  sizeof(*(reg->hr_db_elapsed_time)),
+				  O2HB_DB_TYPE_REGION_ELAPSED_TIME,
+				  0, 0, reg);
+	if (!reg->hr_debug_elapsed_time)
+		goto fail;
+
+	reg->hr_debug_pinned =
+		o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, reg->hr_debug_dir,
+				  &(reg->hr_db_pinned),
+				  sizeof(*(reg->hr_db_pinned)),
+				  O2HB_DB_TYPE_REGION_PINNED,
+				  0, 0, reg);
+	if (!reg->hr_debug_pinned)
+		goto fail;
+
+	return 0;
+
+fail:
+	mlog_errno(ret);
+	return ret;
+}
+
+/*
+ * Allocate a new heartbeat region called @name, add it to
+ * o2hb_all_regions and create its debugfs entries.  When global heartbeat
+ * is active the region also claims the first free slot in
+ * o2hb_region_bitmap.
+ *
+ * Returns the region's config item, or an ERR_PTR: -ENOMEM (allocation),
+ * -ENAMETOOLONG (name longer than O2HB_MAX_REGION_NAME_LEN), -EFBIG (no
+ * free region number) or the error from o2hb_debug_region_init().
+ */
static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
const char *name)
{
struct o2hb_region *reg = NULL;
+ int ret;
reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
if (reg == NULL)
return ERR_PTR(-ENOMEM);
- config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+ if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) {
+ ret = -ENAMETOOLONG;
+ goto free;
+ }
spin_lock(&o2hb_live_lock);
+ reg->hr_region_num = 0;
+ if (o2hb_global_heartbeat_active()) {
+ reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
+ O2NM_MAX_REGIONS);
+ if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
+ spin_unlock(&o2hb_live_lock);
+ ret = -EFBIG;
+ goto free;
+ }
+ set_bit(reg->hr_region_num, o2hb_region_bitmap);
+ }
list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
spin_unlock(&o2hb_live_lock);
+ config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+
+ ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
+ if (ret) {
+ /*
+ * NOTE(review): at this point reg is on o2hb_all_regions and
+ * may own a bit in o2hb_region_bitmap.  config_item_put()
+ * presumably drops the last reference and runs the region's
+ * release callback; if that callback frees reg, the kfree()
+ * under the free label is a double free — confirm.
+ */
+ config_item_put(&reg->hr_item);
+ goto free;
+ }
+
return &reg->hr_item;
+free:
+ kfree(reg);
+ return ERR_PTR(ret);
}
static void o2hb_heartbeat_group_drop_item(struct config_group *group,
@@ -1606,26 +2108,62 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
{
struct task_struct *hb_task;
struct o2hb_region *reg = to_o2hb_region(item);
+ int quorum_region = 0;
/* stop the thread when the user removes the region dir */
spin_lock(&o2hb_live_lock);
hb_task = reg->hr_task;
reg->hr_task = NULL;
+ reg->hr_item_dropped = 1;
spin_unlock(&o2hb_live_lock);
if (hb_task)
kthread_stop(hb_task);
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+ clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ quorum_region = 1;
+ clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
+ ((atomic_read(&reg->hr_steady_iterations) == 0) ?
+ "stopped" : "start aborted"), config_item_name(item),
+ reg->hr_dev_name);
+ }
+
/*
* If we're racing a dev_write(), we need to wake them. They will
* check reg->hr_task
*/
if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ reg->hr_aborted_start = 1;
atomic_set(&reg->hr_steady_iterations, 0);
wake_up(&o2hb_steady_queue);
}
config_item_put(item);
+
+ if (!o2hb_global_heartbeat_active() || !quorum_region)
+ return;
+
+ /*
+ * If global heartbeat active and there are dependent users,
+ * pin all regions if quorum region count <= CUT_OFF
+ */
+ spin_lock(&o2hb_live_lock);
+
+ if (!o2hb_dependent_users)
+ goto unlock;
+
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+ o2hb_region_pin(NULL);
+
+unlock:
+ spin_unlock(&o2hb_live_lock);
}
struct o2hb_heartbeat_group_attribute {
@@ -1685,6 +2223,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
return count;
}
+/*
+ * Show handler of the heartbeat group's mode attribute: writes the
+ * description string of the current heartbeat mode, followed by a
+ * newline, into @page and returns the number of characters written.
+ */
+static
+ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
+				       char *page)
+{
+	const char *desc = o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode];
+
+	return sprintf(page, "%s\n", desc);
+}
+
+/*
+ * Store handler of the heartbeat group's mode attribute.
+ *
+ * One trailing newline in @page is ignored.  The remaining text is
+ * compared case-insensitively, over its own length only, with each mode
+ * description; the first match is handed to
+ * o2hb_global_heartbeat_mode_set().
+ *
+ * Returns @count once a description matched — even when setting the mode
+ * failed, in which case only the notice is skipped — and -EINVAL for
+ * empty or unrecognised input.
+ */
+static
+ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
+					const char *page, size_t count)
+{
+	size_t len = count;
+	unsigned int mode;
+
+	if (page[count - 1] == '\n')
+		len--;
+	if (len == 0)
+		return -EINVAL;
+
+	for (mode = 0; mode < O2HB_HEARTBEAT_NUM_MODES; mode++) {
+		if (strnicmp(page, o2hb_heartbeat_mode_desc[mode], len) == 0) {
+			if (o2hb_global_heartbeat_mode_set(mode) == 0)
+				printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
+				       o2hb_heartbeat_mode_desc[mode]);
+			return count;
+		}
+	}
+
+	return -EINVAL;
+
+}
+
static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "dead_threshold",
@@ -1693,12 +2266,21 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold
.store = o2hb_heartbeat_group_threshold_store,
};
+/*
+ * configfs attribute "mode" of the heartbeat group: readable by everyone,
+ * writable by the owner, served by the mode_show/mode_store handlers.
+ */
+static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "mode",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = o2hb_heartbeat_group_mode_show,
+ .store = o2hb_heartbeat_group_mode_store,
+};
+
static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
&o2hb_heartbeat_group_attr_threshold.attr,
+ &o2hb_heartbeat_group_attr_mode.attr,
NULL,
};
-static struct configfs_item_operations o2hb_hearbeat_group_item_ops = {
+static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
.show_attribute = o2hb_heartbeat_group_show,
.store_attribute = o2hb_heartbeat_group_store,
};
@@ -1710,7 +2292,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
static struct config_item_type o2hb_heartbeat_group_type = {
.ct_group_ops = &o2hb_heartbeat_group_group_ops,
- .ct_item_ops = &o2hb_hearbeat_group_item_ops,
+ .ct_item_ops = &o2hb_heartbeat_group_item_ops,
.ct_attrs = o2hb_heartbeat_group_attrs,
.ct_owner = THIS_MODULE,
};
@@ -1742,7 +2324,7 @@ void o2hb_free_hb_set(struct config_group *group)
kfree(hs);
}
-/* hb callback registration and issueing */
+/* hb callback registration and issuing */
static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type)
{
@@ -1767,70 +2349,150 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
}
EXPORT_SYMBOL_GPL(o2hb_setup_callback);
-static struct o2hb_region *o2hb_find_region(const char *region_uuid)
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only pin the matching region. In global we pin all the active
+ * regions.
+ */
+static int o2hb_region_pin(const char *region_uuid)
{
- struct o2hb_region *p, *reg = NULL;
+ int ret = 0, found = 0;
+ struct o2hb_region *reg;
+ char *uuid;
assert_spin_locked(&o2hb_live_lock);
- list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
- if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
- reg = p;
- break;
+ list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
+
+ uuid = config_item_name(&reg->hr_item);
+
+ /* local heartbeat */
+ if (region_uuid) {
+ if (strcmp(region_uuid, uuid))
+ continue;
+ found = 1;
+ }
+
+ /*
+ * Dropped regions were already skipped at the top of the loop,
+ * so only the pinned test can be true here.
+ */
+ if (reg->hr_item_pinned || reg->hr_item_dropped)
+ goto skip_pin;
+
+ /* Ignore ENOENT only for local hb (userdlm domain) */
+ /*
+ * NOTE(review): this runs with o2hb_live_lock held (asserted
+ * above) — confirm o2nm_depend_item() cannot sleep.
+ */
+ ret = o2nm_depend_item(&reg->hr_item);
+ if (!ret) {
+ mlog(ML_CLUSTER, "Pin region %s\n", uuid);
+ reg->hr_item_pinned = 1;
+ } else {
+ if (ret == -ENOENT && found)
+ ret = 0;
+ else {
+ mlog(ML_ERROR, "Pin region %s fails with %d\n",
+ uuid, ret);
+ break;
+ }
}
+skip_pin:
+ /* A lookup by name stops at the first matching region. */
+ if (found)
+ break;
}
- return reg;
+ /* 0, or the o2nm_depend_item() error that ended the loop. */
+ return ret;
}
-static int o2hb_region_get(const char *region_uuid)
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only unpin the matching region. In global we unpin all the
+ * active regions.
+ */
+static void o2hb_region_unpin(const char *region_uuid)
{
- int ret = 0;
struct o2hb_region *reg;
+ char *uuid;
+ int found = 0;
- spin_lock(&o2hb_live_lock);
-
- reg = o2hb_find_region(region_uuid);
- if (!reg)
- ret = -ENOENT;
- spin_unlock(&o2hb_live_lock);
-
- if (ret)
- goto out;
+ assert_spin_locked(&o2hb_live_lock);
- ret = o2nm_depend_this_node();
- if (ret)
- goto out;
+ /* Walk every region, skipping those already dropped. */
+ list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
- ret = o2nm_depend_item(&reg->hr_item);
- if (ret)
- o2nm_undepend_this_node();
+ uuid = config_item_name(&reg->hr_item);
+ /* With a name given, only the region of that name is handled. */
+ if (region_uuid) {
+ if (strcmp(region_uuid, uuid))
+ continue;
+ found = 1;
+ }
-out:
- return ret;
+ /* Release the dependency only if hr_item_pinned is set. */
+ if (reg->hr_item_pinned) {
+ mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
+ o2nm_undepend_item(&reg->hr_item);
+ reg->hr_item_pinned = 0;
+ }
+ /* A lookup by name stops at the first matching region. */
+ if (found)
+ break;
+ }
+}
-static void o2hb_region_put(const char *region_uuid)
+/*
+ * Account for a new user of the heartbeat, taking o2hb_live_lock.
+ *
+ * Without global heartbeat the region named @region_uuid is pinned.  With
+ * global heartbeat the dependent-user count is incremented and, for the
+ * first user only, all regions are pinned when the number of quorum
+ * regions is at most O2HB_PIN_CUT_OFF.
+ *
+ * Returns 0 or the error from o2hb_region_pin().
+ * NOTE(review): o2hb_dependent_users is not rolled back here when
+ * o2hb_region_pin(NULL) fails — confirm the caller compensates.
+ */
+static int o2hb_region_inc_user(const char *region_uuid)
{
- struct o2hb_region *reg;
+ int ret = 0;
spin_lock(&o2hb_live_lock);
- reg = o2hb_find_region(region_uuid);
+ /* local heartbeat */
+ if (!o2hb_global_heartbeat_active()) {
+ ret = o2hb_region_pin(region_uuid);
+ goto unlock;
+ }
+ /*
+ * if global heartbeat active and this is the first dependent user,
+ * pin all regions if quorum region count <= CUT_OFF
+ */
+ o2hb_dependent_users++;
+ if (o2hb_dependent_users > 1)
+ goto unlock;
+
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+ ret = o2hb_region_pin(NULL);
+
+unlock:
+ spin_unlock(&o2hb_live_lock);
+ return ret;
+}
- if (reg) {
- o2nm_undepend_item(&reg->hr_item);
- o2nm_undepend_this_node();
+/*
+ * Drop one user of the heartbeat, taking o2hb_live_lock.
+ *
+ * Without global heartbeat the region named @region_uuid is unpinned.
+ * With global heartbeat the dependent-user count is decremented and all
+ * regions are unpinned once it reaches zero.
+ * NOTE(review): has external linkage although o2hb_region_inc_user() is
+ * static — confirm whether that is intended.
+ */
+void o2hb_region_dec_user(const char *region_uuid)
+{
+ spin_lock(&o2hb_live_lock);
+
+ /* local heartbeat */
+ if (!o2hb_global_heartbeat_active()) {
+ o2hb_region_unpin(region_uuid);
+ goto unlock;
}
+
+ /*
+ * if global heartbeat active and there are no dependent users,
+ * unpin all quorum regions
+ */
+ o2hb_dependent_users--;
+ if (!o2hb_dependent_users)
+ o2hb_region_unpin(NULL);
+
+unlock:
+ spin_unlock(&o2hb_live_lock);
}
int o2hb_register_callback(const char *region_uuid,
struct o2hb_callback_func *hc)
{
- struct o2hb_callback_func *tmp;
- struct list_head *iter;
+ struct o2hb_callback_func *f;
struct o2hb_callback *hbcall;
int ret;
@@ -1844,17 +2506,18 @@ int o2hb_register_callback(const char *region_uuid,
}
if (region_uuid) {
- ret = o2hb_region_get(region_uuid);
- if (ret)
+ ret = o2hb_region_inc_user(region_uuid);
+ if (ret) {
+ mlog_errno(ret);
goto out;
+ }
}
down_write(&o2hb_callback_sem);
- list_for_each(iter, &hbcall->list) {
- tmp = list_entry(iter, struct o2hb_callback_func, hc_item);
- if (hc->hc_priority < tmp->hc_priority) {
- list_add_tail(&hc->hc_item, iter);
+ list_for_each_entry(f, &hbcall->list, hc_item) {
+ if (hc->hc_priority < f->hc_priority) {
+ list_add_tail(&hc->hc_item, &f->hc_item);
break;
}
}
@@ -1864,7 +2527,7 @@ int o2hb_register_callback(const char *region_uuid,
up_write(&o2hb_callback_sem);
ret = 0;
out:
- mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
+ mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
ret, __builtin_return_address(0), hc);
return ret;
}
@@ -1875,7 +2538,7 @@ void o2hb_unregister_callback(const char *region_uuid,
{
BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
- mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
+ mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
__builtin_return_address(0), hc);
/* XXX Can this happen _with_ a region reference? */
@@ -1883,7 +2546,7 @@ void o2hb_unregister_callback(const char *region_uuid,
return;
if (region_uuid)
- o2hb_region_put(region_uuid);
+ o2hb_region_dec_user(region_uuid);
down_write(&o2hb_callback_sem);
@@ -1960,3 +2623,37 @@ void o2hb_stop_all_regions(void)
spin_unlock(&o2hb_live_lock);
}
EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
+
+int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
+{
+ struct o2hb_region *reg;
+ int numregs = 0;
+ char *p;
+
+ spin_lock(&o2hb_live_lock);
+
+ p = region_uuids;
+ list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
+
+ mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
+ if (numregs < max_regions) {
+ memcpy(p, config_item_name(&reg->hr_item),
+ O2HB_MAX_REGION_NAME_LEN);
+ p += O2HB_MAX_REGION_NAME_LEN;
+ }
+ numregs++;
+ }
+
+ spin_unlock(&o2hb_live_lock);
+
+ return numregs;
+}
+EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
+
+int o2hb_global_heartbeat_active(void)
+{
+ return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL);
+}
+EXPORT_SYMBOL(o2hb_global_heartbeat_active);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 2f1649253b4..00ad8e8fea5 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -31,6 +31,8 @@
#define O2HB_REGION_TIMEOUT_MS 2000
+#define O2HB_MAX_REGION_NAME_LEN 32
+
/* number of changes to be seen as live */
#define O2HB_LIVE_THRESHOLD 2
/* number of equal samples to be seen as dead */
@@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num);
int o2hb_check_node_heartbeating_from_callback(u8 node_num);
int o2hb_check_local_node_heartbeating(void);
void o2hb_stop_all_regions(void);
+int o2hb_get_all_regions(char *region_uuids, u8 numregions);
+int o2hb_global_heartbeat_active(void);
#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 96df5416993..07ac24fd925 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -30,7 +30,7 @@
struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
EXPORT_SYMBOL_GPL(mlog_and_bits);
-struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK);
+struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0);
EXPORT_SYMBOL_GPL(mlog_not_bits);
static ssize_t mlog_mask_show(u64 mask, char *buf)
@@ -80,8 +80,6 @@ struct mlog_attribute {
}
static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
- define_mask(ENTRY),
- define_mask(EXIT),
define_mask(TCP),
define_mask(MSG),
define_mask(SOCKET),
@@ -93,24 +91,12 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(DLM_THREAD),
define_mask(DLM_MASTER),
define_mask(DLM_RECOVERY),
- define_mask(AIO),
- define_mask(JOURNAL),
- define_mask(DISK_ALLOC),
- define_mask(SUPER),
- define_mask(FILE_IO),
- define_mask(EXTENT_MAP),
define_mask(DLM_GLUE),
- define_mask(BH_IO),
- define_mask(UPTODATE),
- define_mask(NAMEI),
- define_mask(INODE),
define_mask(VOTE),
- define_mask(DCACHE),
define_mask(CONN),
define_mask(QUORUM),
- define_mask(EXPORT),
- define_mask(XATTR),
- define_mask(QUOTA),
+ define_mask(BASTS),
+ define_mask(CLUSTER),
define_mask(ERROR),
define_mask(NOTICE),
define_mask(KTHREAD),
@@ -134,7 +120,7 @@ static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
return mlog_mask_store(mlog_attr->mask, buf, count);
}
-static struct sysfs_ops mlog_attr_ops = {
+static const struct sysfs_ops mlog_attr_ops = {
.show = mlog_show,
.store = mlog_store,
};
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 7e72a81bc2d..2260fb9e650 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -48,79 +48,64 @@
* only emit the appropriage printk() when the caller passes in a constant
* mask, as is almost always the case.
*
- * All this bitmask nonsense is hidden from the /proc interface so that Joel
- * doesn't have an aneurism. Reading the file gives a straight forward
- * indication of which bits are on or off:
- * ENTRY off
- * EXIT off
+ * All this bitmask nonsense is managed from the files under
+ * /sys/fs/o2cb/logmask/. Reading the files gives a straightforward
+ * indication of which bits are allowed (allow) or denied (off/deny).
+ * ENTRY deny
+ * EXIT deny
* TCP off
* MSG off
* SOCKET off
- * ERROR off
- * NOTICE on
+ * ERROR allow
+ * NOTICE allow
*
* Writing changes the state of a given bit and requires a strictly formatted
* single write() call:
*
- * write(fd, "ENTRY on", 8);
+ * write(fd, "allow", 5);
*
- * would turn the entry bit on. "1" is also accepted in the place of "on", and
- * "off" and "0" behave as expected.
+ * Echoing an allow/deny/off string into a logmask file flips the
+ * corresponding bit as expected; for example, in bash:
*
- * Some trivial shell can flip all the bits on or off:
+ * log_mask="/sys/fs/o2cb/logmask"
+ * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do
+ * echo allow >"$log_mask"/"$node"
+ * done
*
- * log_mask="/proc/fs/ocfs2_nodemanager/log_mask"
- * cat $log_mask | (
- * while read bit status; do
- * # $1 is "on" or "off", say
- * echo "$bit $1" > $log_mask
- * done
- * )
+ * The debugfs.ocfs2 tool can also flip the bits with the -l option:
+ *
+ * debugfs.ocfs2 -l TCP allow
*/
/* for task_struct */
#include <linux/sched.h>
/* bits that are frequently given and infrequently matched in the low word */
-/* NOTE: If you add a flag, you need to also update mlog.c! */
-#define ML_ENTRY 0x0000000000000001ULL /* func call entry */
-#define ML_EXIT 0x0000000000000002ULL /* func call exit */
-#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */
-#define ML_MSG 0x0000000000000008ULL /* net network messages */
-#define ML_SOCKET 0x0000000000000010ULL /* net socket lifetime */
-#define ML_HEARTBEAT 0x0000000000000020ULL /* hb all heartbeat tracking */
-#define ML_HB_BIO 0x0000000000000040ULL /* hb io tracing */
-#define ML_DLMFS 0x0000000000000080ULL /* dlm user dlmfs */
-#define ML_DLM 0x0000000000000100ULL /* dlm general debugging */
-#define ML_DLM_DOMAIN 0x0000000000000200ULL /* dlm domain debugging */
-#define ML_DLM_THREAD 0x0000000000000400ULL /* dlm domain thread */
-#define ML_DLM_MASTER 0x0000000000000800ULL /* dlm master functions */
-#define ML_DLM_RECOVERY 0x0000000000001000ULL /* dlm master functions */
-#define ML_AIO 0x0000000000002000ULL /* ocfs2 aio read and write */
-#define ML_JOURNAL 0x0000000000004000ULL /* ocfs2 journalling functions */
-#define ML_DISK_ALLOC 0x0000000000008000ULL /* ocfs2 disk allocation */
-#define ML_SUPER 0x0000000000010000ULL /* ocfs2 mount / umount */
-#define ML_FILE_IO 0x0000000000020000ULL /* ocfs2 file I/O */
-#define ML_EXTENT_MAP 0x0000000000040000ULL /* ocfs2 extent map caching */
-#define ML_DLM_GLUE 0x0000000000080000ULL /* ocfs2 dlm glue layer */
-#define ML_BH_IO 0x0000000000100000ULL /* ocfs2 buffer I/O */
-#define ML_UPTODATE 0x0000000000200000ULL /* ocfs2 caching sequence #'s */
-#define ML_NAMEI 0x0000000000400000ULL /* ocfs2 directory / namespace */
-#define ML_INODE 0x0000000000800000ULL /* ocfs2 inode manipulation */
-#define ML_VOTE 0x0000000001000000ULL /* ocfs2 node messaging */
-#define ML_DCACHE 0x0000000002000000ULL /* ocfs2 dcache operations */
-#define ML_CONN 0x0000000004000000ULL /* net connection management */
-#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
-#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
-#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
-#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
+/* NOTE: If you add a flag, you need to also update masklog.c! */
+#define ML_TCP 0x0000000000000001ULL /* net cluster/tcp.c */
+#define ML_MSG 0x0000000000000002ULL /* net network messages */
+#define ML_SOCKET 0x0000000000000004ULL /* net socket lifetime */
+#define ML_HEARTBEAT 0x0000000000000008ULL /* hb all heartbeat tracking */
+#define ML_HB_BIO 0x0000000000000010ULL /* hb io tracing */
+#define ML_DLMFS 0x0000000000000020ULL /* dlm user dlmfs */
+#define ML_DLM 0x0000000000000040ULL /* dlm general debugging */
+#define ML_DLM_DOMAIN 0x0000000000000080ULL /* dlm domain debugging */
+#define ML_DLM_THREAD 0x0000000000000100ULL /* dlm domain thread */
+#define ML_DLM_MASTER 0x0000000000000200ULL /* dlm master functions */
+#define ML_DLM_RECOVERY 0x0000000000000400ULL /* dlm recovery functions */
+#define ML_DLM_GLUE 0x0000000000000800ULL /* ocfs2 dlm glue layer */
+#define ML_VOTE 0x0000000000001000ULL /* ocfs2 node messaging */
+#define ML_CONN 0x0000000000002000ULL /* net connection management */
+#define ML_QUORUM 0x0000000000004000ULL /* net connection quorum */
+#define ML_BASTS 0x0000000000008000ULL /* dlmglue asts and basts */
+#define ML_CLUSTER 0x0000000000010000ULL /* cluster stack */
+
/* bits that are infrequently given and frequently matched in the high word */
-#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
-#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
-#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
+#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */
+#define ML_NOTICE 0x2000000000000000ULL /* sent to KERN_NOTICE */
+#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */
#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
-#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
#ifndef MLOG_MASK_PREFIX
#define MLOG_MASK_PREFIX 0
#endif
@@ -194,9 +179,9 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
* previous token if args expands to nothing.
*/
#define __mlog_printk(level, fmt, args...) \
- printk(level "(%u,%lu):%s:%d " fmt, task_pid_nr(current), \
- __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \
- ##args)
+ printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
+ task_pid_nr(current), __mlog_cpu_guess, \
+ __PRETTY_FUNCTION__, __LINE__ , ##args)
#define mlog(mask, fmt, args...) do { \
u64 __m = MLOG_MASK_PREFIX | (mask); \
@@ -214,62 +199,11 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
#define mlog_errno(st) do { \
int _st = (st); \
if (_st != -ERESTARTSYS && _st != -EINTR && \
- _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \
+ _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \
+ _st != -EDQUOT) \
mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
} while (0)
-#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
-#define mlog_entry(fmt, args...) do { \
- mlog(ML_ENTRY, "ENTRY:" fmt , ##args); \
-} while (0)
-
-#define mlog_entry_void() do { \
- mlog(ML_ENTRY, "ENTRY:\n"); \
-} while (0)
-
-/*
- * We disable this for sparse.
- */
-#if !defined(__CHECKER__)
-#define mlog_exit(st) do { \
- if (__builtin_types_compatible_p(typeof(st), unsigned long)) \
- mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st)); \
- else if (__builtin_types_compatible_p(typeof(st), signed long)) \
- mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st)); \
- else if (__builtin_types_compatible_p(typeof(st), unsigned int) \
- || __builtin_types_compatible_p(typeof(st), unsigned short) \
- || __builtin_types_compatible_p(typeof(st), unsigned char)) \
- mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st)); \
- else if (__builtin_types_compatible_p(typeof(st), signed int) \
- || __builtin_types_compatible_p(typeof(st), signed short) \
- || __builtin_types_compatible_p(typeof(st), signed char)) \
- mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st)); \
- else if (__builtin_types_compatible_p(typeof(st), long long)) \
- mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
- else \
- mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st)); \
-} while (0)
-#else
-#define mlog_exit(st) do { \
- mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
-} while (0)
-#endif
-
-#define mlog_exit_ptr(ptr) do { \
- mlog(ML_EXIT, "EXIT: %p\n", ptr); \
-} while (0)
-
-#define mlog_exit_void() do { \
- mlog(ML_EXIT, "EXIT\n"); \
-} while (0)
-#else
-#define mlog_entry(...) do { } while (0)
-#define mlog_entry_void(...) do { } while (0)
-#define mlog_exit(...) do { } while (0)
-#define mlog_exit_ptr(...) do { } while (0)
-#define mlog_exit_void(...) do { } while (0)
-#endif /* defined(CONFIG_OCFS2_DEBUG_MASKLOG) */
-
#define mlog_bug_on_msg(cond, fmt, args...) do { \
if (cond) { \
mlog(ML_ERROR, "bug expression: " #cond "\n"); \
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index f8424874fa0..73ba81928bc 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -46,10 +46,17 @@
#define O2NET_DEBUG_DIR "o2net"
#define SC_DEBUG_NAME "sock_containers"
#define NST_DEBUG_NAME "send_tracking"
+#define STATS_DEBUG_NAME "stats"
+#define NODES_DEBUG_NAME "connected_nodes"
+
+#define SHOW_SOCK_CONTAINERS 0
+#define SHOW_SOCK_STATS 1
static struct dentry *o2net_dentry;
static struct dentry *sc_dentry;
static struct dentry *nst_dentry;
+static struct dentry *stats_dentry;
+static struct dentry *nodes_dentry;
static DEFINE_SPINLOCK(o2net_debug_lock);
@@ -123,37 +130,42 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static int nst_seq_show(struct seq_file *seq, void *v)
{
struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+ ktime_t now;
+ s64 sock, send, status;
spin_lock(&o2net_debug_lock);
nst = next_nst(dummy_nst);
+ if (!nst)
+ goto out;
- if (nst != NULL) {
- /* get_task_comm isn't exported. oh well. */
- seq_printf(seq, "%p:\n"
- " pid: %lu\n"
- " tgid: %lu\n"
- " process name: %s\n"
- " node: %u\n"
- " sc: %p\n"
- " message id: %d\n"
- " message type: %u\n"
- " message key: 0x%08x\n"
- " sock acquiry: %lu.%ld\n"
- " send start: %lu.%ld\n"
- " wait start: %lu.%ld\n",
- nst, (unsigned long)nst->st_task->pid,
- (unsigned long)nst->st_task->tgid,
- nst->st_task->comm, nst->st_node,
- nst->st_sc, nst->st_id, nst->st_msg_type,
- nst->st_msg_key,
- nst->st_sock_time.tv_sec,
- (long)nst->st_sock_time.tv_usec,
- nst->st_send_time.tv_sec,
- (long)nst->st_send_time.tv_usec,
- nst->st_status_time.tv_sec,
- (long)nst->st_status_time.tv_usec);
- }
+ now = ktime_get();
+ sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
+ send = ktime_to_us(ktime_sub(now, nst->st_send_time));
+ status = ktime_to_us(ktime_sub(now, nst->st_status_time));
+
+ /* get_task_comm isn't exported. oh well. */
+ seq_printf(seq, "%p:\n"
+ " pid: %lu\n"
+ " tgid: %lu\n"
+ " process name: %s\n"
+ " node: %u\n"
+ " sc: %p\n"
+ " message id: %d\n"
+ " message type: %u\n"
+ " message key: 0x%08x\n"
+ " sock acquiry: %lld usecs ago\n"
+ " send start: %lld usecs ago\n"
+ " wait start: %lld usecs ago\n",
+ nst, (unsigned long)task_pid_nr(nst->st_task),
+ (unsigned long)nst->st_task->tgid,
+ nst->st_task->comm, nst->st_node,
+ nst->st_sc, nst->st_id, nst->st_msg_type,
+ nst->st_msg_key,
+ (long long)sock,
+ (long long)send,
+ (long long)status);
+out:
spin_unlock(&o2net_debug_lock);
return 0;
@@ -163,7 +175,7 @@ static void nst_seq_stop(struct seq_file *seq, void *v)
{
}
-static struct seq_operations nst_seq_ops = {
+static const struct seq_operations nst_seq_ops = {
.start = nst_seq_start,
.next = nst_seq_next,
.stop = nst_seq_stop,
@@ -207,7 +219,7 @@ static int nst_fop_release(struct inode *inode, struct file *file)
return seq_release_private(inode, file);
}
-static struct file_operations nst_seq_fops = {
+static const struct file_operations nst_seq_fops = {
.open = nst_fop_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -228,6 +240,11 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc)
spin_unlock(&o2net_debug_lock);
}
+struct o2net_sock_debug {
+ int dbg_ctxt;
+ struct o2net_sock_container *dbg_sock;
+};
+
static struct o2net_sock_container
*next_sc(struct o2net_sock_container *sc_start)
{
@@ -253,7 +270,8 @@ static struct o2net_sock_container
static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct o2net_sock_container *sc, *dummy_sc = seq->private;
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
spin_lock(&o2net_debug_lock);
sc = next_sc(dummy_sc);
@@ -264,7 +282,8 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct o2net_sock_container *sc, *dummy_sc = seq->private;
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
spin_lock(&o2net_debug_lock);
sc = next_sc(dummy_sc);
@@ -276,65 +295,107 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return sc; /* unused, just needs to be null when done */
}
-#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec
+#ifdef CONFIG_OCFS2_FS_STATS
+# define sc_send_count(_s) ((_s)->sc_send_count)
+# define sc_recv_count(_s) ((_s)->sc_recv_count)
+# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total))
+# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total))
+# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total))
+# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total))
+#else
+# define sc_send_count(_s) (0U)
+# define sc_recv_count(_s) (0U)
+# define sc_tv_acquiry_total_ns(_s) (0LL)
+# define sc_tv_send_total_ns(_s) (0LL)
+# define sc_tv_status_total_ns(_s) (0LL)
+# define sc_tv_process_total_ns(_s) (0LL)
+#endif
+
+/* So that debugfs.ocfs2 can determine which format is being used */
+#define O2NET_STATS_STR_VERSION 1
+static void sc_show_sock_stats(struct seq_file *seq,
+ struct o2net_sock_container *sc)
+{
+ if (!sc)
+ return;
+
+ seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
+ sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
+ (long long)sc_tv_acquiry_total_ns(sc),
+ (long long)sc_tv_send_total_ns(sc),
+ (long long)sc_tv_status_total_ns(sc),
+ (unsigned long)sc_recv_count(sc),
+ (long long)sc_tv_process_total_ns(sc));
+}
+
+static void sc_show_sock_container(struct seq_file *seq,
+ struct o2net_sock_container *sc)
+{
+ struct inet_sock *inet = NULL;
+ __be32 saddr = 0, daddr = 0;
+ __be16 sport = 0, dport = 0;
+
+ if (!sc)
+ return;
+
+ if (sc->sc_sock) {
+ inet = inet_sk(sc->sc_sock->sk);
+ /* the stack's structs aren't sparse endian clean */
+ saddr = (__force __be32)inet->inet_saddr;
+ daddr = (__force __be32)inet->inet_daddr;
+ sport = (__force __be16)inet->inet_sport;
+ dport = (__force __be16)inet->inet_dport;
+ }
+
+ /* XXX sigh, inet-> doesn't have sparse annotation so any
+ * use of it here generates a warning with -Wbitwise */
+ seq_printf(seq, "%p:\n"
+ " krefs: %d\n"
+ " sock: %pI4:%u -> "
+ "%pI4:%u\n"
+ " remote node: %s\n"
+ " page off: %zu\n"
+ " handshake ok: %u\n"
+ " timer: %lld usecs\n"
+ " data ready: %lld usecs\n"
+ " advance start: %lld usecs\n"
+ " advance stop: %lld usecs\n"
+ " func start: %lld usecs\n"
+ " func stop: %lld usecs\n"
+ " func key: 0x%08x\n"
+ " func type: %u\n",
+ sc,
+ atomic_read(&sc->sc_kref.refcount),
+ &saddr, inet ? ntohs(sport) : 0,
+ &daddr, inet ? ntohs(dport) : 0,
+ sc->sc_node->nd_name,
+ sc->sc_page_off,
+ sc->sc_handshake_ok,
+ (long long)ktime_to_us(sc->sc_tv_timer),
+ (long long)ktime_to_us(sc->sc_tv_data_ready),
+ (long long)ktime_to_us(sc->sc_tv_advance_start),
+ (long long)ktime_to_us(sc->sc_tv_advance_stop),
+ (long long)ktime_to_us(sc->sc_tv_func_start),
+ (long long)ktime_to_us(sc->sc_tv_func_stop),
+ sc->sc_msg_key,
+ sc->sc_msg_type);
+}
static int sc_seq_show(struct seq_file *seq, void *v)
{
- struct o2net_sock_container *sc, *dummy_sc = seq->private;
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
spin_lock(&o2net_debug_lock);
sc = next_sc(dummy_sc);
- if (sc != NULL) {
- struct inet_sock *inet = NULL;
-
- __be32 saddr = 0, daddr = 0;
- __be16 sport = 0, dport = 0;
-
- if (sc->sc_sock) {
- inet = inet_sk(sc->sc_sock->sk);
- /* the stack's structs aren't sparse endian clean */
- saddr = (__force __be32)inet->saddr;
- daddr = (__force __be32)inet->daddr;
- sport = (__force __be16)inet->sport;
- dport = (__force __be16)inet->dport;
- }
-
- /* XXX sigh, inet-> doesn't have sparse annotation so any
- * use of it here generates a warning with -Wbitwise */
- seq_printf(seq, "%p:\n"
- " krefs: %d\n"
- " sock: %pI4:%u -> "
- "%pI4:%u\n"
- " remote node: %s\n"
- " page off: %zu\n"
- " handshake ok: %u\n"
- " timer: %lu.%ld\n"
- " data ready: %lu.%ld\n"
- " advance start: %lu.%ld\n"
- " advance stop: %lu.%ld\n"
- " func start: %lu.%ld\n"
- " func stop: %lu.%ld\n"
- " func key: %u\n"
- " func type: %u\n",
- sc,
- atomic_read(&sc->sc_kref.refcount),
- &saddr, inet ? ntohs(sport) : 0,
- &daddr, inet ? ntohs(dport) : 0,
- sc->sc_node->nd_name,
- sc->sc_page_off,
- sc->sc_handshake_ok,
- TV_SEC_USEC(sc->sc_tv_timer),
- TV_SEC_USEC(sc->sc_tv_data_ready),
- TV_SEC_USEC(sc->sc_tv_advance_start),
- TV_SEC_USEC(sc->sc_tv_advance_stop),
- TV_SEC_USEC(sc->sc_tv_func_start),
- TV_SEC_USEC(sc->sc_tv_func_stop),
- sc->sc_msg_key,
- sc->sc_msg_type);
+ if (sc) {
+ if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
+ sc_show_sock_container(seq, sc);
+ else
+ sc_show_sock_stats(seq, sc);
}
-
spin_unlock(&o2net_debug_lock);
return 0;
@@ -344,14 +405,14 @@ static void sc_seq_stop(struct seq_file *seq, void *v)
{
}
-static struct seq_operations sc_seq_ops = {
+static const struct seq_operations sc_seq_ops = {
.start = sc_seq_start,
.next = sc_seq_next,
.stop = sc_seq_stop,
.show = sc_seq_show,
};
-static int sc_fop_open(struct inode *inode, struct file *file)
+static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
{
struct o2net_sock_container *dummy_sc;
struct seq_file *seq;
@@ -369,7 +430,8 @@ static int sc_fop_open(struct inode *inode, struct file *file)
goto out;
seq = file->private_data;
- seq->private = dummy_sc;
+ seq->private = sd;
+ sd->dbg_sock = dummy_sc;
o2net_debug_add_sc(dummy_sc);
dummy_sc = NULL;
@@ -382,62 +444,136 @@ out:
static int sc_fop_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
- struct o2net_sock_container *dummy_sc = seq->private;
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *dummy_sc = sd->dbg_sock;
o2net_debug_del_sc(dummy_sc);
return seq_release_private(inode, file);
}
-static struct file_operations sc_seq_fops = {
+static int stats_fop_open(struct inode *inode, struct file *file)
+{
+ struct o2net_sock_debug *sd;
+
+ sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
+ if (sd == NULL)
+ return -ENOMEM;
+
+ sd->dbg_ctxt = SHOW_SOCK_STATS;
+ sd->dbg_sock = NULL;
+
+ return sc_common_open(file, sd);
+}
+
+static const struct file_operations stats_seq_fops = {
+ .open = stats_fop_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = sc_fop_release,
+};
+
+static int sc_fop_open(struct inode *inode, struct file *file)
+{
+ struct o2net_sock_debug *sd;
+
+ sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
+ if (sd == NULL)
+ return -ENOMEM;
+
+ sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
+ sd->dbg_sock = NULL;
+
+ return sc_common_open(file, sd);
+}
+
+static const struct file_operations sc_seq_fops = {
.open = sc_fop_open,
.read = seq_read,
.llseek = seq_lseek,
.release = sc_fop_release,
};
-int o2net_debugfs_init(void)
+static int o2net_fill_bitmap(char *buf, int len)
{
- o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
- if (!o2net_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ int i = -1, out = 0;
- nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR,
- o2net_dentry, NULL,
- &nst_seq_fops);
- if (!nst_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ o2net_fill_node_map(map, sizeof(map));
- sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR,
- o2net_dentry, NULL,
- &sc_seq_fops);
- if (!sc_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+ out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+ out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+
+ return out;
+}
+
+static int nodes_fop_open(struct inode *inode, struct file *file)
+{
+ char *buf;
+
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
+
+ file->private_data = buf;
return 0;
-bail:
- if (sc_dentry)
- debugfs_remove(sc_dentry);
- if (nst_dentry)
- debugfs_remove(nst_dentry);
- if (o2net_dentry)
- debugfs_remove(o2net_dentry);
- return -ENOMEM;
}
+static int o2net_debug_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static ssize_t o2net_debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+ i_size_read(file->f_mapping->host));
+}
+
+static const struct file_operations nodes_fops = {
+ .open = nodes_fop_open,
+ .release = o2net_debug_release,
+ .read = o2net_debug_read,
+ .llseek = generic_file_llseek,
+};
+
void o2net_debugfs_exit(void)
{
- if (sc_dentry)
- debugfs_remove(sc_dentry);
- if (nst_dentry)
- debugfs_remove(nst_dentry);
+ debugfs_remove(nodes_dentry);
+ debugfs_remove(stats_dentry);
+ debugfs_remove(sc_dentry);
+ debugfs_remove(nst_dentry);
+ debugfs_remove(o2net_dentry);
+}
+
+int o2net_debugfs_init(void)
+{
+ umode_t mode = S_IFREG|S_IRUSR;
+
+ o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
if (o2net_dentry)
- debugfs_remove(o2net_dentry);
+ nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &nst_seq_fops);
+ if (nst_dentry)
+ sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &sc_seq_fops);
+ if (sc_dentry)
+ stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &stats_seq_fops);
+ if (stats_dentry)
+ nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &nodes_fops);
+ if (nodes_dentry)
+ return 0;
+
+ o2net_debugfs_exit();
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 7ee6188bc79..441c84e169e 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/configfs.h>
@@ -28,13 +29,16 @@
#include "heartbeat.h"
#include "masklog.h"
#include "sys.h"
-#include "ver.h"
/* for now we operate under the assertion that there can be only one
* cluster active at a time. Changing this will require trickling
* cluster references throughout where nodes are looked up */
struct o2nm_cluster *o2nm_single_cluster = NULL;
+char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
+ "reset", /* O2NM_FENCE_RESET */
+ "panic", /* O2NM_FENCE_PANIC */
+};
struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
{
@@ -579,6 +583,43 @@ static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
return o2nm_cluster_attr_write(page, count,
&cluster->cl_reconnect_delay_ms);
}
+
+static ssize_t o2nm_cluster_attr_fence_method_read(
+ struct o2nm_cluster *cluster, char *page)
+{
+ ssize_t ret = 0;
+
+ if (cluster)
+ ret = sprintf(page, "%s\n",
+ o2nm_fence_method_desc[cluster->cl_fence_method]);
+ return ret;
+}
+
+static ssize_t o2nm_cluster_attr_fence_method_write(
+ struct o2nm_cluster *cluster, const char *page, size_t count)
+{
+ unsigned int i;
+
+ if (page[count - 1] != '\n')
+ goto bail;
+
+ for (i = 0; i < O2NM_FENCE_METHODS; ++i) {
+ if (count != strlen(o2nm_fence_method_desc[i]) + 1)
+ continue;
+ if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
+ continue;
+ if (cluster->cl_fence_method != i) {
+ printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
+ o2nm_fence_method_desc[i]);
+ cluster->cl_fence_method = i;
+ }
+ return count;
+ }
+
+bail:
+ return -EINVAL;
+}
+
static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "idle_timeout_ms",
@@ -603,10 +644,19 @@ static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
.store = o2nm_cluster_attr_reconnect_delay_ms_write,
};
+static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "fence_method",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = o2nm_cluster_attr_fence_method_read,
+ .store = o2nm_cluster_attr_fence_method_write,
+};
+
static struct configfs_attribute *o2nm_cluster_attrs[] = {
&o2nm_cluster_attr_idle_timeout_ms.attr,
&o2nm_cluster_attr_keepalive_delay_ms.attr,
&o2nm_cluster_attr_reconnect_delay_ms.attr,
+ &o2nm_cluster_attr_fence_method.attr,
NULL,
};
static ssize_t o2nm_cluster_show(struct config_item *item,
@@ -660,6 +710,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group,
config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
spin_lock_init(&node->nd_lock);
+ mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
+
return &node->nd_item;
}
@@ -693,6 +745,9 @@ static void o2nm_node_group_drop_item(struct config_group *group,
}
write_unlock(&cluster->cl_nodes_lock);
+ mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
+ config_item_name(&node->nd_item));
+
config_item_put(item);
}
@@ -778,6 +833,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
+ cluster->cl_fence_method = O2NM_FENCE_RESET;
ret = &cluster->cl_group;
o2nm_single_cluster = cluster;
@@ -888,8 +944,6 @@ static int __init init_o2nm(void)
{
int ret = -1;
- cluster_print_version();
-
ret = o2hb_init();
if (ret)
goto out;
@@ -927,6 +981,7 @@ out:
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster management");
module_init(init_o2nm)
module_exit(exit_o2nm)
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index c992ea0da4a..09ea2d388bb 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -33,6 +33,12 @@
#include <linux/configfs.h>
#include <linux/rbtree.h>
+enum o2nm_fence_method {
+ O2NM_FENCE_RESET = 0,
+ O2NM_FENCE_PANIC,
+ O2NM_FENCE_METHODS, /* Number of fence methods */
+};
+
struct o2nm_node {
spinlock_t nd_lock;
struct config_item nd_item;
@@ -58,6 +64,7 @@ struct o2nm_cluster {
unsigned int cl_idle_timeout_ms;
unsigned int cl_keepalive_delay_ms;
unsigned int cl_reconnect_delay_ms;
+ enum o2nm_fence_method cl_fence_method;
/* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 5b9854bad57..49b594325be 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -36,4 +36,10 @@
/* host name, group name, cluster name all 64 bytes */
#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
+/*
+ * Maximum number of global heartbeat regions allowed.
+ * **CAUTION** Changing this number will break dlm compatibility.
+ */
+#define O2NM_MAX_REGIONS 32
+
#endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index bbacf7da48a..1ec141e758d 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
* and if they're the last, they fire off the decision.
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/reboot.h>
@@ -74,11 +73,23 @@ static void o2quo_fence_self(void)
* threads can still schedule, etc, etc */
o2hb_stop_all_regions();
- printk("ocfs2 is very sorry to be fencing this system by restarting\n");
- emergency_restart();
+ switch (o2nm_single_cluster->cl_fence_method) {
+ case O2NM_FENCE_PANIC:
+ panic("*** ocfs2 is very sorry to be fencing this system by "
+ "panicing ***\n");
+ break;
+ default:
+ WARN_ON(o2nm_single_cluster->cl_fence_method >=
+ O2NM_FENCE_METHODS);
+ case O2NM_FENCE_RESET:
+ printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
+ "system by restarting ***\n");
+ emergency_restart();
+ break;
+ };
}
-/* Indicate that a timeout occured on a hearbeat region write. The
+/* Indicate that a timeout occurred on a heartbeat region write. The
* other nodes in the cluster may consider us dead at that time so we
* want to "fence" ourselves so that we don't scribble on the disk
* after they think they've recovered us. This can't solve all
@@ -250,10 +261,10 @@ void o2quo_hb_still_up(u8 node)
spin_unlock(&qs->qs_lock);
}
-/* This is analagous to hb_up. as a node's connection comes up we delay the
+/* This is analogous to hb_up. as a node's connection comes up we delay the
* quorum decision until we see it heartbeating. the hold will be dropped in
* hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
- * it's already heartbeating we we might be dropping a hold that conn_up got.
+ * it's already heartbeating we might be dropping a hold that conn_up got.
* */
void o2quo_conn_up(u8 node)
{
@@ -314,5 +325,7 @@ void o2quo_init(void)
void o2quo_exit(void)
{
- flush_scheduled_work();
+ struct o2quo_state *qs = &o2quo_state;
+
+ flush_work(&qs->qs_work);
}
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index bc702dab5d1..b7f57271d49 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -41,7 +41,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
}
static struct kobj_attribute attr_version =
- __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
+ __ATTR(interface_revision, S_IRUGO, version_show, NULL);
static struct attribute *o2cb_attrs[] = {
&attr_version.attr,
@@ -57,7 +57,6 @@ static struct kset *o2cb_kset;
void o2cb_sys_shutdown(void)
{
mlog_sys_shutdown();
- sysfs_remove_link(NULL, "o2cb");
kset_unregister(o2cb_kset);
}
@@ -69,14 +68,6 @@ int o2cb_sys_init(void)
if (!o2cb_kset)
return -ENOMEM;
- /*
- * Create this symlink for backwards compatibility with old
- * versions of ocfs2-tools which look for things in /sys/o2cb.
- */
- ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
- if (ret)
- goto error;
-
ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
if (ret)
goto error;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 9fbe849f634..681691bc233 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -59,6 +59,7 @@
#include <linux/idr.h>
#include <linux/kref.h>
#include <linux/net.h>
+#include <linux/export.h>
#include <net/tcp.h>
#include <asm/uaccess.h>
@@ -72,9 +73,9 @@
#include "tcp_internal.h"
-#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
+#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
- NIPQUAD(sc->sc_node->nd_ipv4_address), \
+ &sc->sc_node->nd_ipv4_address, \
ntohs(sc->sc_node->nd_ipv4_port)
/*
@@ -107,7 +108,7 @@ static struct rb_root o2net_handler_tree = RB_ROOT;
static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
/* XXX someday we'll need better accounting */
-static struct socket *o2net_listen_sock = NULL;
+static struct socket *o2net_listen_sock;
/*
* listen work is only queued by the listening socket callbacks on the
@@ -136,7 +137,7 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] =
static void o2net_sc_connect_completed(struct work_struct *work);
static void o2net_rx_until_empty(struct work_struct *work);
static void o2net_shutdown_sc(struct work_struct *work);
-static void o2net_listen_data_ready(struct sock *sk, int bytes);
+static void o2net_listen_data_ready(struct sock *sk);
static void o2net_sc_send_keep_req(struct work_struct *work);
static void o2net_idle_timer(unsigned long data);
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
@@ -153,74 +154,125 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
nst->st_node = node;
}
-static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
{
- do_gettimeofday(&nst->st_sock_time);
+ nst->st_sock_time = ktime_get();
}
-static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
{
- do_gettimeofday(&nst->st_send_time);
+ nst->st_send_time = ktime_get();
}
-static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
{
- do_gettimeofday(&nst->st_status_time);
+ nst->st_status_time = ktime_get();
}
-static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
- struct o2net_sock_container *sc)
+static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+ struct o2net_sock_container *sc)
{
nst->st_sc = sc;
}
-static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+ u32 msg_id)
{
nst->st_id = msg_id;
}
-#else /* CONFIG_DEBUG_FS */
-
-static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
- u32 msgkey, struct task_struct *task, u8 node)
+static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
{
+ sc->sc_tv_timer = ktime_get();
}
-static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_data_ready = ktime_get();
}
-static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_advance_start = ktime_get();
}
-static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_advance_stop = ktime_get();
}
-static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
- struct o2net_sock_container *sc)
+static inline void o2net_set_func_start_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_func_start = ktime_get();
}
-static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
- u32 msg_id)
+static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_func_stop = ktime_get();
}
+#else /* CONFIG_DEBUG_FS */
+# define o2net_init_nst(a, b, c, d, e)
+# define o2net_set_nst_sock_time(a)
+# define o2net_set_nst_send_time(a)
+# define o2net_set_nst_status_time(a)
+# define o2net_set_nst_sock_container(a, b)
+# define o2net_set_nst_msg_id(a, b)
+# define o2net_set_sock_timer(a)
+# define o2net_set_data_ready_time(a)
+# define o2net_set_advance_start_time(a)
+# define o2net_set_advance_stop_time(a)
+# define o2net_set_func_start_time(a)
+# define o2net_set_func_stop_time(a)
#endif /* CONFIG_DEBUG_FS */
-static inline int o2net_reconnect_delay(void)
+#ifdef CONFIG_OCFS2_FS_STATS
+static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
+{
+ return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
+}
+
+static void o2net_update_send_stats(struct o2net_send_tracking *nst,
+ struct o2net_sock_container *sc)
+{
+ sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,
+ ktime_sub(ktime_get(),
+ nst->st_status_time));
+ sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,
+ ktime_sub(nst->st_status_time,
+ nst->st_send_time));
+ sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,
+ ktime_sub(nst->st_send_time,
+ nst->st_sock_time));
+ sc->sc_send_count++;
+}
+
+static void o2net_update_recv_stats(struct o2net_sock_container *sc)
+{
+ sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
+ o2net_get_func_run_time(sc));
+ sc->sc_recv_count++;
+}
+
+#else
+
+# define o2net_update_send_stats(a, b)
+
+# define o2net_update_recv_stats(sc)
+
+#endif /* CONFIG_OCFS2_FS_STATS */
+
+static inline unsigned int o2net_reconnect_delay(void)
{
return o2nm_single_cluster->cl_reconnect_delay_ms;
}
-static inline int o2net_keepalive_delay(void)
+static inline unsigned int o2net_keepalive_delay(void)
{
return o2nm_single_cluster->cl_keepalive_delay_ms;
}
-static inline int o2net_idle_timeout(void)
+static inline unsigned int o2net_idle_timeout(void)
{
return o2nm_single_cluster->cl_idle_timeout_ms;
}
@@ -252,28 +304,22 @@ static u8 o2net_num_from_nn(struct o2net_node *nn)
static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)
{
- int ret = 0;
-
- do {
- if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) {
- ret = -EAGAIN;
- break;
- }
- spin_lock(&nn->nn_lock);
- ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id);
- if (ret == 0)
- list_add_tail(&nsw->ns_node_item,
- &nn->nn_status_list);
- spin_unlock(&nn->nn_lock);
- } while (ret == -EAGAIN);
+ int ret;
- if (ret == 0) {
- init_waitqueue_head(&nsw->ns_wq);
- nsw->ns_sys_status = O2NET_ERR_NONE;
- nsw->ns_status = 0;
+ spin_lock(&nn->nn_lock);
+ ret = idr_alloc(&nn->nn_status_idr, nsw, 0, 0, GFP_ATOMIC);
+ if (ret >= 0) {
+ nsw->ns_id = ret;
+ list_add_tail(&nsw->ns_node_item, &nn->nn_status_list);
}
+ spin_unlock(&nn->nn_lock);
+ if (ret < 0)
+ return ret;
- return ret;
+ init_waitqueue_head(&nsw->ns_wq);
+ nsw->ns_sys_status = O2NET_ERR_NONE;
+ nsw->ns_status = 0;
+ return 0;
}
static void o2net_complete_nsw_locked(struct o2net_node *nn,
@@ -355,10 +401,14 @@ static void sc_kref_release(struct kref *kref)
sc->sc_sock = NULL;
}
+ o2nm_undepend_item(&sc->sc_node->nd_item);
o2nm_node_put(sc->sc_node);
sc->sc_node = NULL;
o2net_debug_del_sc(sc);
+
+ if (sc->sc_page)
+ __free_page(sc->sc_page);
kfree(sc);
}
@@ -376,6 +426,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
{
struct o2net_sock_container *sc, *ret = NULL;
struct page *page = NULL;
+ int status = 0;
page = alloc_page(GFP_NOFS);
sc = kzalloc(sizeof(*sc), GFP_NOFS);
@@ -386,6 +437,13 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
o2nm_node_get(node);
sc->sc_node = node;
+ /* pin the node item of the remote node */
+ status = o2nm_depend_item(&node->nd_item);
+ if (status) {
+ mlog_errno(status);
+ o2nm_node_put(node);
+ goto out;
+ }
INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed);
INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty);
INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
@@ -485,17 +543,18 @@ static void o2net_set_nn_state(struct o2net_node *nn,
}
if (was_valid && !valid) {
- printk(KERN_INFO "o2net: no longer connected to "
- SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
+ if (old_sc)
+ printk(KERN_NOTICE "o2net: No longer connected to "
+ SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
o2net_complete_nodes_nsw(nn);
}
if (!was_valid && valid) {
o2quo_conn_up(o2net_num_from_nn(nn));
cancel_delayed_work(&nn->nn_connect_expired);
- printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
+ printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
o2nm_this_node() > sc->sc_node->nd_num ?
- "connected to" : "accepted connection from",
+ "Connected to" : "Accepted connection from",
SC_NODEF_ARGS(sc));
}
@@ -505,7 +564,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
* the work queue actually being up. */
if (!valid && o2net_wq) {
unsigned long delay;
- /* delay if we're withing a RECONNECT_DELAY of the
+ /* delay if we're within a RECONNECT_DELAY of the
* last attempt */
delay = (nn->nn_last_connect_attempt +
msecs_to_jiffies(o2net_reconnect_delay()))
@@ -538,15 +597,15 @@ static void o2net_set_nn_state(struct o2net_node *nn,
}
/* see o2net_register_callbacks() */
-static void o2net_data_ready(struct sock *sk, int bytes)
+static void o2net_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
read_lock(&sk->sk_callback_lock);
if (sk->sk_user_data) {
struct o2net_sock_container *sc = sk->sk_user_data;
sclog(sc, "data_ready hit\n");
- do_gettimeofday(&sc->sc_tv_data_ready);
+ o2net_set_data_ready_time(sc);
o2net_sc_queue_work(sc, &sc->sc_rx_work);
ready = sc->sc_data_ready;
} else {
@@ -554,7 +613,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
}
read_unlock(&sk->sk_callback_lock);
- ready(sk, bytes);
+ ready(sk);
}
/* see o2net_register_callbacks() */
@@ -575,16 +634,19 @@ static void o2net_state_change(struct sock *sk)
state_change = sc->sc_state_change;
switch(sk->sk_state) {
- /* ignore connecting sockets as they make progress */
- case TCP_SYN_SENT:
- case TCP_SYN_RECV:
- break;
- case TCP_ESTABLISHED:
- o2net_sc_queue_work(sc, &sc->sc_connect_work);
- break;
- default:
- o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
- break;
+ /* ignore connecting sockets as they make progress */
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ break;
+ case TCP_ESTABLISHED:
+ o2net_sc_queue_work(sc, &sc->sc_connect_work);
+ break;
+ default:
+ printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
+ " shutdown, state %d\n",
+ SC_NODEF_ARGS(sc), sk->sk_state);
+ o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+ break;
}
out:
read_unlock(&sk->sk_callback_lock);
@@ -704,32 +766,32 @@ static struct o2net_msg_handler *
o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
struct rb_node **ret_parent)
{
- struct rb_node **p = &o2net_handler_tree.rb_node;
- struct rb_node *parent = NULL;
+ struct rb_node **p = &o2net_handler_tree.rb_node;
+ struct rb_node *parent = NULL;
struct o2net_msg_handler *nmh, *ret = NULL;
int cmp;
- while (*p) {
- parent = *p;
- nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
+ while (*p) {
+ parent = *p;
+ nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
cmp = o2net_handler_cmp(nmh, msg_type, key);
- if (cmp < 0)
- p = &(*p)->rb_left;
- else if (cmp > 0)
- p = &(*p)->rb_right;
- else {
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else if (cmp > 0)
+ p = &(*p)->rb_right;
+ else {
ret = nmh;
- break;
+ break;
}
- }
+ }
- if (ret_p != NULL)
- *ret_p = p;
- if (ret_parent != NULL)
- *ret_parent = parent;
+ if (ret_p != NULL)
+ *ret_p = p;
+ if (ret_parent != NULL)
+ *ret_parent = parent;
- return ret;
+ return ret;
}
static void o2net_handler_kref_release(struct kref *kref)
@@ -806,7 +868,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
/* we've had some trouble with handlers seemingly vanishing. */
mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p,
&parent) == NULL,
- "couldn't find handler we *just* registerd "
+ "couldn't find handler we *just* registered "
"for type %u key %08x\n", msg_type, key);
}
write_unlock(&o2net_handler_lock);
@@ -854,57 +916,30 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
{
- int ret;
- mm_segment_t oldfs;
- struct kvec vec = {
- .iov_len = len,
- .iov_base = data,
- };
- struct msghdr msg = {
- .msg_iovlen = 1,
- .msg_iov = (struct iovec *)&vec,
- .msg_flags = MSG_DONTWAIT,
- };
-
- oldfs = get_fs();
- set_fs(get_ds());
- ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
- set_fs(oldfs);
-
- return ret;
+ struct kvec vec = { .iov_len = len, .iov_base = data, };
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
+ return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags);
}
static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
size_t veclen, size_t total)
{
int ret;
- mm_segment_t oldfs;
- struct msghdr msg = {
- .msg_iov = (struct iovec *)vec,
- .msg_iovlen = veclen,
- };
+ struct msghdr msg;
if (sock == NULL) {
ret = -EINVAL;
goto out;
}
- oldfs = get_fs();
- set_fs(get_ds());
- ret = sock_sendmsg(sock, &msg, total);
- set_fs(oldfs);
- if (ret != total) {
- mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
- total);
- if (ret >= 0)
- ret = -EPIPE; /* should be smarter, I bet */
- goto out;
- }
-
- ret = 0;
+ ret = kernel_sendmsg(sock, &msg, vec, veclen, total);
+ if (likely(ret == total))
+ return 0;
+ mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total);
+ if (ret >= 0)
+ ret = -EPIPE; /* should be smarter, I bet */
out:
- if (ret < 0)
- mlog(0, "returning error: %d\n", ret);
+ mlog(0, "returning error: %d\n", ret);
return ret;
}
@@ -930,7 +965,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
cond_resched();
continue;
}
- mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
+ mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
" failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
o2net_ensure_shutdown(nn, sc, 0);
break;
@@ -971,10 +1006,29 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
return ret;
}
+/* Get a map of all nodes to which this node is currently connected */
+void o2net_fill_node_map(unsigned long *map, unsigned bytes)
+{
+ struct o2net_sock_container *sc;
+ int node, ret;
+
+ BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
+
+ memset(map, 0, bytes);
+ for (node = 0; node < O2NM_MAX_NODES; ++node) {
+ o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
+ if (!ret) {
+ set_bit(node, map);
+ sc_put(sc);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(o2net_fill_node_map);
+
int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
size_t caller_veclen, u8 target_node, int *status)
{
- int ret, error = 0;
+ int ret = 0;
struct o2net_msg *msg = NULL;
size_t veclen, caller_bytes = 0;
struct kvec *vec = NULL;
@@ -1015,10 +1069,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
o2net_set_nst_sock_time(&nst);
- ret = wait_event_interruptible(nn->nn_sc_wq,
- o2net_tx_can_proceed(nn, &sc, &error));
- if (!ret && error)
- ret = error;
+ wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret));
if (ret)
goto out;
@@ -1070,6 +1121,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
o2net_set_nst_status_time(&nst);
wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
+ o2net_update_send_stats(&nst, sc);
+
/* Note that we avoid overwriting the callers status return
* variable if a system error was reported on the other
* side. Callers beware. */
@@ -1083,10 +1136,8 @@ out:
o2net_debug_del_nst(&nst); /* must be before dropping sc and node */
if (sc)
sc_put(sc);
- if (vec)
- kfree(vec);
- if (msg)
- kfree(msg);
+ kfree(vec);
+ kfree(msg);
o2net_complete_nsw(nn, &nsw, 0, 0, 0);
return ret;
}
@@ -1183,13 +1234,15 @@ static int o2net_process_message(struct o2net_sock_container *sc,
if (syserr != O2NET_ERR_NONE)
goto out_respond;
- do_gettimeofday(&sc->sc_tv_func_start);
+ o2net_set_func_start_time(sc);
sc->sc_msg_key = be32_to_cpu(hdr->key);
sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
be16_to_cpu(hdr->data_len),
nmh->nh_func_data, &ret_data);
- do_gettimeofday(&sc->sc_tv_func_stop);
+ o2net_set_func_stop_time(sc);
+
+ o2net_update_recv_stats(sc);
out_respond:
/* this destroys the hdr, so don't use it after this */
@@ -1220,11 +1273,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
- mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
- "version %llu but %llu is required, disconnecting\n",
- SC_NODEF_ARGS(sc),
- (unsigned long long)be64_to_cpu(hand->protocol_version),
- O2NET_PROTOCOL_VERSION);
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net "
+ "protocol version %llu but %llu is required. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ (unsigned long long)be64_to_cpu(hand->protocol_version),
+ O2NET_PROTOCOL_VERSION);
/* don't bother reconnecting if it's the wrong version. */
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
@@ -1238,33 +1291,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
*/
if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
o2net_idle_timeout()) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2net_idle_timeout_ms),
- o2net_idle_timeout());
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network "
+ "idle timeout of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2net_idle_timeout_ms),
+ o2net_idle_timeout());
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
o2net_keepalive_delay()) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2net_keepalive_delay_ms),
- o2net_keepalive_delay());
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive "
+ "delay of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2net_keepalive_delay_ms),
+ o2net_keepalive_delay());
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
O2HB_MAX_WRITE_TIMEOUT_MS) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
- O2HB_MAX_WRITE_TIMEOUT_MS);
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat "
+ "timeout of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
+ O2HB_MAX_WRITE_TIMEOUT_MS);
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
@@ -1300,7 +1353,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
size_t datalen;
sclog(sc, "receiving\n");
- do_gettimeofday(&sc->sc_tv_advance_start);
+ o2net_set_advance_start_time(sc);
if (unlikely(sc->sc_handshake_ok == 0)) {
if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
@@ -1375,7 +1428,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
out:
sclog(sc, "ret = %d\n", ret);
- do_gettimeofday(&sc->sc_tv_advance_stop);
+ o2net_set_advance_stop_time(sc);
return ret;
}
@@ -1475,27 +1528,16 @@ static void o2net_idle_timer(unsigned long data)
{
struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
- struct timeval now;
+#ifdef CONFIG_DEBUG_FS
+ unsigned long msecs = ktime_to_ms(ktime_get()) -
+ ktime_to_ms(sc->sc_tv_timer);
+#else
+ unsigned long msecs = o2net_idle_timeout();
+#endif
- do_gettimeofday(&now);
-
- printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
- "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
- o2net_idle_timeout() / 1000,
- o2net_idle_timeout() % 1000);
- mlog(ML_NOTICE, "here are some times that might help debug the "
- "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
- "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
- sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
- now.tv_sec, (long) now.tv_usec,
- sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
- sc->sc_tv_advance_start.tv_sec,
- (long) sc->sc_tv_advance_start.tv_usec,
- sc->sc_tv_advance_stop.tv_sec,
- (long) sc->sc_tv_advance_stop.tv_usec,
- sc->sc_msg_key, sc->sc_msg_type,
- sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
- sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
+ printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
+ "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
+ msecs / 1000, msecs % 1000);
/*
* Initialize the nn_timeout so that the next connection attempt
@@ -1511,7 +1553,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
msecs_to_jiffies(o2net_keepalive_delay()));
- do_gettimeofday(&sc->sc_tv_timer);
+ o2net_set_sock_timer(sc);
mod_timer(&sc->sc_idle_timeout,
jiffies + msecs_to_jiffies(o2net_idle_timeout()));
}
@@ -1627,13 +1669,12 @@ static void o2net_start_connect(struct work_struct *work)
ret = 0;
out:
- if (ret) {
- mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed "
- "with errno %d\n", SC_NODEF_ARGS(sc), ret);
+ if (ret && sc) {
+ printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
+ " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
/* 0 err so that another will be queued and attempted
* from set_nn_state */
- if (sc)
- o2net_ensure_shutdown(nn, sc, 0);
+ o2net_ensure_shutdown(nn, sc, 0);
}
if (sc)
sc_put(sc);
@@ -1652,8 +1693,8 @@ static void o2net_connect_expired(struct work_struct *work)
spin_lock(&nn->nn_lock);
if (!nn->nn_sc_valid) {
- mlog(ML_ERROR, "no connection established with node %u after "
- "%u.%u seconds, giving up and returning errors.\n",
+ printk(KERN_NOTICE "o2net: No connection established with "
+ "node %u after %u.%u seconds, giving up.\n",
o2net_num_from_nn(nn),
o2net_idle_timeout() / 1000,
o2net_idle_timeout() % 1000);
@@ -1696,6 +1737,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
{
o2quo_hb_down(node_num);
+ if (!node)
+ return;
+
if (node_num != o2nm_this_node())
o2net_disconnect_node(node);
@@ -1709,6 +1753,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
o2quo_hb_up(node_num);
+ BUG_ON(!node);
+
/* ensure an immediate connect attempt */
nn->nn_last_connect_attempt = jiffies -
(msecs_to_jiffies(o2net_reconnect_delay()) + 1);
@@ -1753,16 +1799,18 @@ int o2net_register_hb_callbacks(void)
/* ------------------------------------------------------------ */
-static int o2net_accept_one(struct socket *sock)
+static int o2net_accept_one(struct socket *sock, int *more)
{
int ret, slen;
struct sockaddr_in sin;
struct socket *new_sock = NULL;
struct o2nm_node *node = NULL;
+ struct o2nm_node *local_node = NULL;
struct o2net_sock_container *sc = NULL;
struct o2net_node *nn;
BUG_ON(sock == NULL);
+ *more = 0;
ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
sock->sk->sk_protocol, &new_sock);
if (ret)
@@ -1774,6 +1822,7 @@ static int o2net_accept_one(struct socket *sock)
if (ret < 0)
goto out;
+ *more = 1;
new_sock->sk->sk_allocation = GFP_ATOMIC;
ret = o2net_set_nodelay(new_sock);
@@ -1790,17 +1839,25 @@ static int o2net_accept_one(struct socket *sock)
node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
if (node == NULL) {
- mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n",
- &sin.sin_addr.s_addr, ntohs(sin.sin_port));
+ printk(KERN_NOTICE "o2net: Attempt to connect from unknown "
+ "node at %pI4:%d\n", &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
- if (o2nm_this_node() > node->nd_num) {
- mlog(ML_NOTICE, "unexpected connect attempted from a lower "
- "numbered node '%s' at " "%pI4:%d with num %u\n",
- node->nd_name, &sin.sin_addr.s_addr,
- ntohs(sin.sin_port), node->nd_num);
+ if (o2nm_this_node() >= node->nd_num) {
+ local_node = o2nm_get_node_by_num(o2nm_this_node());
+ if (local_node)
+ printk(KERN_NOTICE "o2net: Unexpected connect attempt "
+ "seen at node '%s' (%u, %pI4:%d) from "
+ "node '%s' (%u, %pI4:%d)\n",
+ local_node->nd_name, local_node->nd_num,
+ &(local_node->nd_ipv4_address),
+ ntohs(local_node->nd_ipv4_port),
+ node->nd_name,
+ node->nd_num, &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
@@ -1825,10 +1882,10 @@ static int o2net_accept_one(struct socket *sock)
ret = 0;
spin_unlock(&nn->nn_lock);
if (ret) {
- mlog(ML_NOTICE, "attempt to connect from node '%s' at "
- "%pI4:%d but it already has an open connection\n",
- node->nd_name, &sin.sin_addr.s_addr,
- ntohs(sin.sin_port));
+ printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' "
+ "at %pI4:%d but it already has an open connection\n",
+ node->nd_name, &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
goto out;
}
@@ -1857,21 +1914,48 @@ out:
sock_release(new_sock);
if (node)
o2nm_node_put(node);
+ if (local_node)
+ o2nm_node_put(local_node);
if (sc)
sc_put(sc);
return ret;
}
+/*
+ * This function is invoked in response to one or more
+ * pending accepts at softIRQ level. We must drain the
+ * entire queue before returning.
+ */
+
static void o2net_accept_many(struct work_struct *work)
{
struct socket *sock = o2net_listen_sock;
- while (o2net_accept_one(sock) == 0)
+ int more;
+ int err;
+
+ /*
+ * It is critical to note that due to interrupt moderation
+ * at the network driver level, we can't assume to get a
+ * softIRQ for every single conn since tcp SYN packets
+ * can arrive back-to-back, and therefore many pending
+ * accepts may result in just 1 softIRQ. If we terminate
+ * the o2net_accept_one() loop upon seeing an err, what happens
+ * to the rest of the conns in the queue? If no new SYN
+ * arrives for hours, no softIRQ will be delivered,
+ * and the connections will just sit in the queue.
+ */
+
+ for (;;) {
+ err = o2net_accept_one(sock, &more);
+ if (!more)
+ break;
cond_resched();
+ }
}
-static void o2net_listen_data_ready(struct sock *sk, int bytes)
+static void o2net_listen_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
read_lock(&sk->sk_callback_lock);
ready = sk->sk_user_data;
@@ -1880,18 +1964,29 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes)
goto out;
}
- /* ->sk_data_ready is also called for a newly established child socket
- * before it has been accepted and the acceptor has set up their
- * data_ready.. we only want to queue listen work for our listening
- * socket */
+ /* This callback may be called twice when a new connection
+ * is being established as a child socket inherits everything
+ * from a parent LISTEN socket, including the data_ready cb of
+ * the parent. This leads to a hazard. In o2net_accept_one()
+ * we are still initializing the child socket but have not
+ * changed the inherited data_ready callback yet when
+ * data starts arriving.
+ * We avoid this hazard by checking the state.
+ * For the listening socket, the state will be TCP_LISTEN; for the new
+ * socket, it will be TCP_ESTABLISHED. Also, in this case,
+ * sk->sk_user_data is not a valid function pointer.
+ */
+
if (sk->sk_state == TCP_LISTEN) {
- mlog(ML_TCP, "bytes: %d\n", bytes);
queue_work(o2net_wq, &o2net_listen_work);
+ } else {
+ ready = NULL;
}
out:
read_unlock(&sk->sk_callback_lock);
- ready(sk, bytes);
+ if (ready != NULL)
+ ready(sk);
}
static int o2net_open_listening_sock(__be32 addr, __be16 port)
@@ -1906,7 +2001,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0) {
- mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret);
+ printk(KERN_ERR "o2net: Error %d while creating socket\n", ret);
goto out;
}
@@ -1920,19 +2015,18 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
o2net_listen_sock = sock;
INIT_WORK(&o2net_listen_work, o2net_accept_many);
- sock->sk->sk_reuse = 1;
+ sock->sk->sk_reuse = SK_CAN_REUSE;
ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
if (ret < 0) {
- mlog(ML_ERROR, "unable to bind socket at %pI4:%u, "
- "ret=%d\n", &addr, ntohs(port), ret);
+ printk(KERN_ERR "o2net: Error %d while binding socket at "
+ "%pI4:%u\n", ret, &addr, ntohs(port));
goto out;
}
ret = sock->ops->listen(sock, 64);
- if (ret < 0) {
- mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n",
- &addr, ntohs(port), ret);
- }
+ if (ret < 0)
+ printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n",
+ ret, &addr, ntohs(port));
out:
if (ret) {
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index fd6179eb26d..5bada2a69b5 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
struct list_head *unreg_list);
void o2net_unregister_handler_list(struct list_head *list);
+void o2net_fill_node_map(unsigned long *map, unsigned bytes);
+
struct o2nm_node;
int o2net_register_hb_callbacks(void);
void o2net_unregister_hb_callbacks(void);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 8d58cfe410b..dc024367110 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -32,10 +32,10 @@
* on their number */
#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
-/*
+/*
* This version number represents quite a lot, unfortunately. It not
* only represents the raw network message protocol on the wire but also
- * locking semantics of the file system using the protocol. It should
+ * locking semantics of the file system using the protocol. It should
* be somewhere else, I'm sure, but right now it isn't.
*
* With version 11, we separate out the filesystem locking portion. The
@@ -129,7 +129,7 @@ struct o2net_node {
struct o2net_sock_container {
struct kref sc_kref;
- /* the next two are vaild for the life time of the sc */
+ /* the next two are valid for the life time of the sc */
struct socket *sc_sock;
struct o2nm_node *sc_node;
@@ -165,19 +165,28 @@ struct o2net_sock_container {
/* original handlers for the sockets */
void (*sc_state_change)(struct sock *sk);
- void (*sc_data_ready)(struct sock *sk, int bytes);
-#ifdef CONFIG_DEBUG_FS
- struct list_head sc_net_debug_item;
-#endif
- struct timeval sc_tv_timer;
- struct timeval sc_tv_data_ready;
- struct timeval sc_tv_advance_start;
- struct timeval sc_tv_advance_stop;
- struct timeval sc_tv_func_start;
- struct timeval sc_tv_func_stop;
+ void (*sc_data_ready)(struct sock *sk);
+
u32 sc_msg_key;
u16 sc_msg_type;
+#ifdef CONFIG_DEBUG_FS
+ struct list_head sc_net_debug_item;
+ ktime_t sc_tv_timer;
+ ktime_t sc_tv_data_ready;
+ ktime_t sc_tv_advance_start;
+ ktime_t sc_tv_advance_stop;
+ ktime_t sc_tv_func_start;
+ ktime_t sc_tv_func_stop;
+#endif
+#ifdef CONFIG_OCFS2_FS_STATS
+ ktime_t sc_tv_acquiry_total;
+ ktime_t sc_tv_send_total;
+ ktime_t sc_tv_status_total;
+ u32 sc_send_count;
+ u32 sc_recv_count;
+ ktime_t sc_tv_process_total;
+#endif
struct mutex sc_send_lock;
};
@@ -220,9 +229,9 @@ struct o2net_send_tracking {
u32 st_msg_type;
u32 st_msg_key;
u8 st_node;
- struct timeval st_sock_time;
- struct timeval st_send_time;
- struct timeval st_status_time;
+ ktime_t st_sock_time;
+ ktime_t st_send_time;
+ ktime_t st_status_time;
};
#else
struct o2net_send_tracking {
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
deleted file mode 100644
index a56eee6abad..00000000000
--- a/fs/ocfs2/cluster/ver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define CLUSTER_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
-
-void cluster_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
deleted file mode 100644
index 32554c3382c..00000000000
--- a/fs/ocfs2/cluster/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef O2CLUSTER_VER_H
-#define O2CLUSTER_VER_H
-
-void cluster_print_version(void);
-
-#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b574431a031..e2e05a106be 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -28,7 +28,6 @@
#include <linux/slab.h>
#include <linux/namei.h>
-#define MLOG_MASK_PREFIX ML_DCACHE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -38,24 +37,48 @@
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
-#include "super.h"
+#include "ocfs2_trace.h"
+
+void ocfs2_dentry_attach_gen(struct dentry *dentry)
+{
+ unsigned long gen =
+ OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+ BUG_ON(dentry->d_inode);
+ dentry->d_fsdata = (void *)gen;
+}
-static int ocfs2_dentry_revalidate(struct dentry *dentry,
- struct nameidata *nd)
+static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode;
int ret = 0; /* if all else fails, just return false */
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+ struct ocfs2_super *osb;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
- mlog_entry("(0x%p, '%.*s')\n", dentry,
- dentry->d_name.len, dentry->d_name.name);
+ inode = dentry->d_inode;
+ osb = OCFS2_SB(dentry->d_sb);
- /* Never trust a negative dentry - force a new lookup. */
+ trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len,
+ dentry->d_name.name);
+
+ /* For a negative dentry -
+ * check the generation number of the parent and compare with the
+ * one stored in the inode.
+ */
if (inode == NULL) {
- mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
- dentry->d_name.name);
- goto bail;
+ unsigned long gen = (unsigned long) dentry->d_fsdata;
+ unsigned long pgen;
+ spin_lock(&dentry->d_lock);
+ pgen = OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+ spin_unlock(&dentry->d_lock);
+ trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
+ dentry->d_name.name,
+ pgen, gen);
+ if (gen != pgen)
+ goto bail;
+ goto valid;
}
BUG_ON(!osb);
@@ -67,8 +90,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
/* did we or someone else delete this inode? */
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
- mlog(0, "inode (%llu) deleted, returning false\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_dentry_revalidate_delete(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
goto bail;
}
spin_unlock(&OCFS2_I(inode)->ip_lock);
@@ -78,18 +101,27 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
* inode nlink hits zero, it never goes back.
*/
if (inode->i_nlink == 0) {
- mlog(0, "Inode %llu orphaned, returning false "
- "dir = %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- S_ISDIR(inode->i_mode));
+ trace_ocfs2_dentry_revalidate_orphaned(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ S_ISDIR(inode->i_mode));
goto bail;
}
+ /*
+ * If the last lookup failed to create dentry lock, let us
+ * redo it.
+ */
+ if (!dentry->d_fsdata) {
+ trace_ocfs2_dentry_revalidate_nofsdata(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ goto bail;
+ }
+
+valid:
ret = 1;
bail:
- mlog_exit(ret);
-
+ trace_ocfs2_dentry_revalidate_ret(ret);
return ret;
}
@@ -137,28 +169,24 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
u64 parent_blkno,
int skip_unhashed)
{
- struct list_head *p;
- struct dentry *dentry = NULL;
-
- spin_lock(&dcache_lock);
-
- list_for_each(p, &inode->i_dentry) {
- dentry = list_entry(p, struct dentry, d_alias);
+ struct dentry *dentry;
+ spin_lock(&inode->i_lock);
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+ spin_lock(&dentry->d_lock);
if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
- mlog(0, "dentry found: %.*s\n",
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_find_local_alias(dentry->d_name.len,
+ dentry->d_name.name);
- dget_locked(dentry);
- break;
+ dget_dlock(dentry);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&inode->i_lock);
+ return dentry;
}
-
- dentry = NULL;
+ spin_unlock(&dentry->d_lock);
}
-
- spin_unlock(&dcache_lock);
-
- return dentry;
+ spin_unlock(&inode->i_lock);
+ return NULL;
}
DEFINE_SPINLOCK(dentry_attach_lock);
@@ -203,9 +231,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
struct dentry *alias;
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
- mlog(0, "Attach \"%.*s\", parent %llu, fsdata: %p\n",
- dentry->d_name.len, dentry->d_name.name,
- (unsigned long long)parent_blkno, dl);
+ trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name,
+ (unsigned long long)parent_blkno, dl);
/*
* Negative dentry. We ignore these for now.
@@ -216,6 +243,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
if (!inode)
return 0;
+ if (!dentry->d_inode && dentry->d_fsdata) {
+ /* Converting a negative dentry to positive
+ Clear dentry->d_fsdata */
+ dentry->d_fsdata = dl = NULL;
+ }
+
if (dl) {
mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
" \"%.*s\": old parent: %llu, new: %llu\n",
@@ -249,7 +282,9 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
(unsigned long long)parent_blkno,
(unsigned long long)dl->dl_parent_blkno);
- mlog(0, "Found: %s\n", dl->dl_lockres.l_name);
+ trace_ocfs2_dentry_attach_lock_found(dl->dl_lockres.l_name,
+ (unsigned long long)parent_blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
goto out_attach;
}
@@ -310,34 +345,6 @@ out_attach:
return ret;
}
-static DEFINE_SPINLOCK(dentry_list_lock);
-
-/* We limit the number of dentry locks to drop in one go. We have
- * this limit so that we don't starve other users of ocfs2_wq. */
-#define DL_INODE_DROP_COUNT 64
-
-/* Drop inode references from dentry locks */
-void ocfs2_drop_dl_inodes(struct work_struct *work)
-{
- struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
- dentry_lock_work);
- struct ocfs2_dentry_lock *dl;
- int drop_count = DL_INODE_DROP_COUNT;
-
- spin_lock(&dentry_list_lock);
- while (osb->dentry_lock_list && drop_count--) {
- dl = osb->dentry_lock_list;
- osb->dentry_lock_list = dl->dl_next;
- spin_unlock(&dentry_list_lock);
- iput(dl->dl_inode);
- kfree(dl);
- spin_lock(&dentry_list_lock);
- }
- if (osb->dentry_lock_list)
- queue_work(ocfs2_wq, &osb->dentry_lock_work);
- spin_unlock(&dentry_list_lock);
-}
-
/*
* ocfs2_dentry_iput() and friends.
*
@@ -362,23 +369,16 @@ void ocfs2_drop_dl_inodes(struct work_struct *work)
static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl)
{
+ iput(dl->dl_inode);
ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
ocfs2_lock_res_free(&dl->dl_lockres);
-
- /* We leave dropping of inode reference to ocfs2_wq as that can
- * possibly lead to inode deletion which gets tricky */
- spin_lock(&dentry_list_lock);
- if (!osb->dentry_lock_list)
- queue_work(ocfs2_wq, &osb->dentry_lock_work);
- dl->dl_next = osb->dentry_lock_list;
- osb->dentry_lock_list = dl;
- spin_unlock(&dentry_list_lock);
+ kfree(dl);
}
void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl)
{
- int unlock;
+ int unlock = 0;
BUG_ON(dl->dl_count == 0);
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index faa12e75f98..55f58892b15 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,13 +29,8 @@
extern const struct dentry_operations ocfs2_dentry_ops;
struct ocfs2_dentry_lock {
- /* Use count of dentry lock */
unsigned int dl_count;
- union {
- /* Linked list of dentry locks to release */
- struct ocfs2_dentry_lock *dl_next;
- u64 dl_parent_blkno;
- };
+ u64 dl_parent_blkno;
/*
* The ocfs2_dentry_lock keeps an inode reference until
@@ -52,8 +47,6 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl);
-void ocfs2_drop_dl_inodes(struct work_struct *work);
-
struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
int skip_unhashed);
@@ -61,5 +54,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
struct inode *old_dir, struct inode *new_dir);
extern spinlock_t dentry_attach_lock;
+void ocfs2_dentry_attach_gen(struct dentry *dentry);
#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c5752305627..0717662b4ae 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -43,7 +43,6 @@
#include <linux/quotaops.h>
#include <linux/sort.h>
-#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -61,13 +60,13 @@
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static unsigned char ocfs2_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
@@ -176,7 +175,7 @@ static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
struct ocfs2_dx_root_block *dx_root;
struct ocfs2_dir_block_trailer *trailer;
- ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -322,21 +321,23 @@ static int ocfs2_check_dir_entry(struct inode * dir,
const char *error_msg = NULL;
const int rlen = le16_to_cpu(de->rec_len);
- if (rlen < OCFS2_DIR_REC_LEN(1))
+ if (unlikely(rlen < OCFS2_DIR_REC_LEN(1)))
error_msg = "rec_len is smaller than minimal";
- else if (rlen % 4 != 0)
+ else if (unlikely(rlen % 4 != 0))
error_msg = "rec_len % 4 != 0";
- else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
+ else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
- else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+ else if (unlikely(
+ ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))
error_msg = "directory entry across blocks";
- if (error_msg != NULL)
+ if (unlikely(error_msg != NULL))
mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
"offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
(unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
de->name_len);
+
return error_msg == NULL ? 1 : 0;
}
@@ -354,7 +355,7 @@ static inline int ocfs2_match(int len,
/*
* Returns 0 if not found, -1 on failure, and 1 on success
*/
-static int inline ocfs2_search_dirblock(struct buffer_head *bh,
+static inline int ocfs2_search_dirblock(struct buffer_head *bh,
struct inode *dir,
const char *name, int namelen,
unsigned long offset,
@@ -367,8 +368,6 @@ static int inline ocfs2_search_dirblock(struct buffer_head *bh,
int de_len;
int ret = 0;
- mlog_entry_void();
-
de_buf = first_de;
dlimit = de_buf + bytes;
@@ -402,7 +401,7 @@ static int inline ocfs2_search_dirblock(struct buffer_head *bh,
}
bail:
- mlog_exit(ret);
+ trace_ocfs2_search_dirblock(ret);
return ret;
}
@@ -447,8 +446,7 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
* We don't validate dirents here, that's handled
* in-place when the code walks them.
*/
- mlog(0, "Validating dirblock %llu\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
@@ -564,7 +562,8 @@ static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
int ret;
struct buffer_head *tmp = *bh;
- ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block);
+ ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp,
+ ocfs2_validate_dir_block);
if (ret) {
mlog_errno(ret);
goto out;
@@ -622,7 +621,8 @@ static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
u64 blkno = le64_to_cpu(di->i_dx_root);
struct buffer_head *tmp = *dx_root_bh;
- ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root);
+ ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
+ ocfs2_validate_dx_root);
/* If ocfs2_read_block() got us a new bh, pass it up. */
if (!ret && !*dx_root_bh)
@@ -662,7 +662,8 @@ static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
int ret;
struct buffer_head *tmp = *dx_leaf_bh;
- ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf);
+ ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
+ ocfs2_validate_dx_leaf);
/* If ocfs2_read_block() got us a new bh, pass it up. */
if (!ret && !*dx_leaf_bh)
@@ -680,7 +681,7 @@ static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
{
int ret;
- ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
+ ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0,
ocfs2_validate_dx_leaf);
if (ret)
mlog_errno(ret);
@@ -703,8 +704,6 @@ static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
int num = 0;
int nblocks, i, err;
- mlog_entry_void();
-
sb = dir->i_sb;
nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
@@ -785,7 +784,7 @@ cleanup_and_exit:
for (; ra_ptr < ra_max; ra_ptr++)
brelse(bh_use[ra_ptr]);
- mlog_exit_ptr(ret);
+ trace_ocfs2_find_entry_el(ret);
return ret;
}
@@ -802,7 +801,8 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
struct ocfs2_extent_rec *rec = NULL;
if (el->l_tree_depth) {
- ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash,
+ &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -946,11 +946,9 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
goto out;
}
- mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
- "returns: %llu\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- namelen, name, hinfo->major_hash, hinfo->minor_hash,
- (unsigned long long)phys);
+ trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno,
+ namelen, name, hinfo->major_hash,
+ hinfo->minor_hash, (unsigned long long)phys);
ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
if (ret) {
@@ -960,9 +958,9 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
- mlog(0, "leaf info: num_used: %d, count: %d\n",
- le16_to_cpu(dx_leaf->dl_list.de_num_used),
- le16_to_cpu(dx_leaf->dl_list.de_count));
+ trace_ocfs2_dx_dir_search_leaf_info(
+ le16_to_cpu(dx_leaf->dl_list.de_num_used),
+ le16_to_cpu(dx_leaf->dl_list.de_count));
entry_list = &dx_leaf->dl_list;
@@ -1133,7 +1131,8 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
access = ocfs2_journal_access_di;
- ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = access(handle, INODE_CACHE(dir), de_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1161,8 +1160,6 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
int i, status = -ENOENT;
ocfs2_journal_access_func access = ocfs2_journal_access_db;
- mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
-
if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
access = ocfs2_journal_access_di;
@@ -1176,7 +1173,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
goto bail;
}
if (de == de_del) {
- status = access(handle, dir, bh,
+ status = access(handle, INODE_CACHE(dir), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
status = -EIO;
@@ -1186,10 +1183,9 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
if (pde)
le16_add_cpu(&pde->rec_len,
le16_to_cpu(de->rec_len));
- else
- de->inode = 0;
+ de->inode = 0;
dir->i_version++;
- status = ocfs2_journal_dirty(handle, bh);
+ ocfs2_journal_dirty(handle, bh);
goto bail;
}
i += le16_to_cpu(de->rec_len);
@@ -1197,7 +1193,6 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
}
bail:
- mlog_exit(status);
return status;
}
@@ -1326,7 +1321,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
* the entry count needs to be updated. Also, we might be
* adding to the start of the free list.
*/
- ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1334,7 +1329,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
}
if (!ocfs2_dx_root_inline(dx_root)) {
- ret = ocfs2_journal_access_dl(handle, dir,
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
lookup->dl_dx_leaf_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
@@ -1343,8 +1338,8 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
}
}
- mlog(0, "Dir %llu: delete entry at index: %d\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
+ trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno,
+ index);
ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
@@ -1493,7 +1488,7 @@ static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
int ret;
struct ocfs2_dx_leaf *dx_leaf;
- ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1523,7 +1518,7 @@ static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
struct ocfs2_dx_root_block *dx_root;
struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
- ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1627,8 +1622,6 @@ int __ocfs2_add_entry(handle_t *handle,
struct buffer_head *insert_bh = lookup->dl_leaf_bh;
char *data_start = insert_bh->b_data;
- mlog_entry_void();
-
if (!namelen)
return -EINVAL;
@@ -1645,11 +1638,13 @@ int __ocfs2_add_entry(handle_t *handle,
*/
if (ocfs2_free_list_at_root(lookup)) {
bh = lookup->dl_dx_root_bh;
- retval = ocfs2_journal_access_dr(handle, dir, bh,
+ retval = ocfs2_journal_access_dr(handle,
+ INODE_CACHE(dir), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
} else {
bh = lookup->dl_prev_leaf_bh;
- retval = ocfs2_journal_access_db(handle, dir, bh,
+ retval = ocfs2_journal_access_db(handle,
+ INODE_CACHE(dir), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
}
if (retval) {
@@ -1700,11 +1695,13 @@ int __ocfs2_add_entry(handle_t *handle,
}
if (insert_bh == parent_fe_bh)
- status = ocfs2_journal_access_di(handle, dir,
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(dir),
insert_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
else {
- status = ocfs2_journal_access_db(handle, dir,
+ status = ocfs2_journal_access_db(handle,
+ INODE_CACHE(dir),
insert_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1743,7 +1740,7 @@ int __ocfs2_add_entry(handle_t *handle,
ocfs2_recalc_free_list(dir, handle, lookup);
dir->i_version++;
- status = ocfs2_journal_dirty(handle, insert_bh);
+ ocfs2_journal_dirty(handle, insert_bh);
retval = 0;
goto bail;
}
@@ -1756,18 +1753,18 @@ int __ocfs2_add_entry(handle_t *handle,
* from ever getting here. */
retval = -ENOSPC;
bail:
+ if (retval)
+ mlog_errno(retval);
- mlog_exit(retval);
return retval;
}
static int ocfs2_dir_foreach_blk_id(struct inode *inode,
u64 *f_version,
- loff_t *f_pos, void *priv,
- filldir_t filldir, int *filldir_err)
+ struct dir_context *ctx)
{
- int ret, i, filldir_ret;
- unsigned long offset = *f_pos;
+ int ret, i;
+ unsigned long offset = ctx->pos;
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
@@ -1783,8 +1780,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
di = (struct ocfs2_dinode *)di_bh->b_data;
data = &di->id2.i_data;
- while (*f_pos < i_size_read(inode)) {
-revalidate:
+ while (ctx->pos < i_size_read(inode)) {
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
@@ -1804,50 +1800,31 @@ revalidate:
break;
i += le16_to_cpu(de->rec_len);
}
- *f_pos = offset = i;
+ ctx->pos = offset = i;
*f_version = inode->i_version;
}
- de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos);
- if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) {
+ de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
+ if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) {
/* On error, skip the f_pos to the end. */
- *f_pos = i_size_read(inode);
- goto out;
+ ctx->pos = i_size_read(inode);
+ break;
}
offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- u64 version = *f_version;
unsigned char d_type = DT_UNKNOWN;
if (de->file_type < OCFS2_FT_MAX)
d_type = ocfs2_filetype_table[de->file_type];
- filldir_ret = filldir(priv, de->name,
- de->name_len,
- *f_pos,
- le64_to_cpu(de->inode),
- d_type);
- if (filldir_ret) {
- if (filldir_err)
- *filldir_err = filldir_ret;
- break;
- }
- if (version != *f_version)
- goto revalidate;
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le64_to_cpu(de->inode), d_type))
+ goto out;
}
- *f_pos += le16_to_cpu(de->rec_len);
+ ctx->pos += le16_to_cpu(de->rec_len);
}
-
out:
brelse(di_bh);
-
return 0;
}
@@ -1857,27 +1834,26 @@ out:
*/
static int ocfs2_dir_foreach_blk_el(struct inode *inode,
u64 *f_version,
- loff_t *f_pos, void *priv,
- filldir_t filldir, int *filldir_err)
+ struct dir_context *ctx,
+ bool persist)
{
- int error = 0;
unsigned long offset, blk, last_ra_blk = 0;
- int i, stored;
+ int i;
struct buffer_head * bh, * tmp;
struct ocfs2_dir_entry * de;
struct super_block * sb = inode->i_sb;
unsigned int ra_sectors = 16;
+ int stored = 0;
- stored = 0;
bh = NULL;
- offset = (*f_pos) & (sb->s_blocksize - 1);
+ offset = ctx->pos & (sb->s_blocksize - 1);
- while (!error && !stored && *f_pos < i_size_read(inode)) {
- blk = (*f_pos) >> sb->s_blocksize_bits;
+ while (ctx->pos < i_size_read(inode)) {
+ blk = ctx->pos >> sb->s_blocksize_bits;
if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
/* Skip the corrupt dirblock and keep trying */
- *f_pos += sb->s_blocksize - offset;
+ ctx->pos += sb->s_blocksize - offset;
continue;
}
@@ -1899,7 +1875,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
ra_sectors = 8;
}
-revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
@@ -1919,93 +1894,64 @@ revalidate:
i += le16_to_cpu(de->rec_len);
}
offset = i;
- *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1))
+ ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
*f_version = inode->i_version;
}
- while (!error && *f_pos < i_size_read(inode)
+ while (ctx->pos < i_size_read(inode)
&& offset < sb->s_blocksize) {
de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
/* On error, skip the f_pos to the
next block. */
- *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1;
+ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
brelse(bh);
- goto out;
+ continue;
}
- offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- unsigned long version = *f_version;
unsigned char d_type = DT_UNKNOWN;
if (de->file_type < OCFS2_FT_MAX)
d_type = ocfs2_filetype_table[de->file_type];
- error = filldir(priv, de->name,
+ if (!dir_emit(ctx, de->name,
de->name_len,
- *f_pos,
le64_to_cpu(de->inode),
- d_type);
- if (error) {
- if (filldir_err)
- *filldir_err = error;
- break;
+ d_type)) {
+ brelse(bh);
+ return 0;
}
- if (version != *f_version)
- goto revalidate;
- stored ++;
+ stored++;
}
- *f_pos += le16_to_cpu(de->rec_len);
+ offset += le16_to_cpu(de->rec_len);
+ ctx->pos += le16_to_cpu(de->rec_len);
}
offset = 0;
brelse(bh);
bh = NULL;
+ if (!persist && stored)
+ break;
}
-
- stored = 0;
-out:
- return stored;
+ return 0;
}
static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
- loff_t *f_pos, void *priv, filldir_t filldir,
- int *filldir_err)
+ struct dir_context *ctx,
+ bool persist)
{
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv,
- filldir, filldir_err);
-
- return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir,
- filldir_err);
+ return ocfs2_dir_foreach_blk_id(inode, f_version, ctx);
+ return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist);
}
/*
* This is intended to be called from inside other kernel functions,
* so we fake some arguments.
*/
-int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
- filldir_t filldir)
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
{
- int ret = 0, filldir_err = 0;
u64 version = inode->i_version;
-
- while (*f_pos < i_size_read(inode)) {
- ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
- filldir, &filldir_err);
- if (ret || filldir_err)
- break;
- }
-
- if (ret > 0)
- ret = -EIO;
-
+ ocfs2_dir_foreach_blk(inode, &version, ctx, true);
return 0;
}
@@ -2013,16 +1959,15 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
* ocfs2_readdir()
*
*/
-int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int ocfs2_readdir(struct file *file, struct dir_context *ctx)
{
int error = 0;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
int lock_level = 0;
- mlog_entry("dirino=%llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
- error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+ error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
if (lock_level && error >= 0) {
/* We release EX lock which used to update atime
* and get PR lock again to reduce contention
@@ -2038,13 +1983,13 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
goto bail_nolock;
}
- error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
- dirent, filldir, NULL);
+ error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
ocfs2_inode_unlock(inode, lock_level);
+ if (error)
+ mlog_errno(error);
bail_nolock:
- mlog_exit(error);
return error;
}
@@ -2060,8 +2005,8 @@ int ocfs2_find_files_on_disk(const char *name,
{
int status = -ENOENT;
- mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_find_files_on_disk(namelen, name, blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
status = ocfs2_find_entry(name, namelen, inode, lookup);
if (status)
@@ -2105,8 +2050,8 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
int ret;
struct ocfs2_dir_lookup_result lookup = { NULL, };
- mlog_entry("dir %llu, name '%.*s'\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
+ trace_ocfs2_check_dir_for_entry(
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
ret = -EEXIST;
if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
@@ -2116,11 +2061,13 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
bail:
ocfs2_free_dir_lookup_result(&lookup);
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
}
struct ocfs2_empty_dir_priv {
+ struct dir_context ctx;
unsigned seen_dot;
unsigned seen_dot_dot;
unsigned seen_other;
@@ -2205,10 +2152,9 @@ out:
int ocfs2_empty_dir(struct inode *inode)
{
int ret;
- loff_t start = 0;
- struct ocfs2_empty_dir_priv priv;
-
- memset(&priv, 0, sizeof(priv));
+ struct ocfs2_empty_dir_priv priv = {
+ .ctx.actor = ocfs2_empty_dir_filldir,
+ };
if (ocfs2_dir_indexed(inode)) {
ret = ocfs2_empty_dir_dx(inode, &priv);
@@ -2220,7 +2166,7 @@ int ocfs2_empty_dir(struct inode *inode)
*/
}
- ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
+ ret = ocfs2_dir_foreach(inode, &priv.ctx);
if (ret)
mlog_errno(ret);
@@ -2280,7 +2226,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
struct ocfs2_inline_data *data = &di->id2.i_data;
unsigned int size = le16_to_cpu(data->id_count);
- ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -2288,15 +2234,10 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
}
ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
-
ocfs2_journal_dirty(handle, di_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
i_size_write(inode, size);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_blocks = ocfs2_inode_sector_count(inode);
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
@@ -2320,8 +2261,6 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
struct buffer_head *new_bh = NULL;
struct ocfs2_dir_entry *de;
- mlog_entry_void();
-
if (ocfs2_new_dir_wants_trailer(inode))
size = ocfs2_dir_trailer_blk_off(parent->i_sb);
@@ -2332,9 +2271,9 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
goto bail;
}
- ocfs2_set_new_buffer_uptodate(inode, new_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
- status = ocfs2_journal_access_db(handle, inode, new_bh,
+ status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -2357,14 +2296,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
ocfs2_init_dir_trailer(inode, new_bh, size);
}
- status = ocfs2_journal_dirty(handle, new_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, new_bh);
i_size_write(inode, inode->i_sb->s_blocksize);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_blocks = ocfs2_inode_sector_count(inode);
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
@@ -2380,7 +2315,6 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
bail:
brelse(new_bh);
- mlog_exit(status);
return status;
}
@@ -2395,32 +2329,32 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
int ret;
struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
u16 dr_suballoc_bit;
- u64 dr_blkno;
+ u64 suballoc_loc, dr_blkno;
unsigned int num_bits;
struct buffer_head *dx_root_bh = NULL;
struct ocfs2_dx_root_block *dx_root;
struct ocfs2_dir_block_trailer *trailer =
ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
- ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
- &num_bits, &dr_blkno);
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+ &dr_suballoc_bit, &num_bits, &dr_blkno);
if (ret) {
mlog_errno(ret);
goto out;
}
- mlog(0, "Dir %llu, attach new index block: %llu\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- (unsigned long long)dr_blkno);
+ trace_ocfs2_dx_dir_attach_index(
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)dr_blkno);
dx_root_bh = sb_getblk(osb->sb, dr_blkno);
if (dx_root_bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
goto out;
}
- ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
- ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret < 0) {
mlog_errno(ret);
@@ -2430,7 +2364,8 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
memset(dx_root, 0, osb->sb->s_blocksize);
strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
- dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
+ dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2449,12 +2384,9 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
dx_root->dr_list.l_count =
cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
}
+ ocfs2_journal_dirty(handle, dx_root_bh);
- ret = ocfs2_journal_dirty(handle, dx_root_bh);
- if (ret)
- mlog_errno(ret);
-
- ret = ocfs2_journal_access_di(handle, dir, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
@@ -2463,12 +2395,12 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
di->i_dx_root = cpu_to_le64(dr_blkno);
+ spin_lock(&OCFS2_I(dir)->ip_lock);
OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+ spin_unlock(&OCFS2_I(dir)->ip_lock);
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, di_bh);
*ret_dx_root_bh = dx_root_bh;
dx_root_bh = NULL;
@@ -2490,14 +2422,14 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
for (i = 0; i < num_dx_leaves; i++) {
bh = sb_getblk(osb->sb, start_blk + i);
if (bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
goto out;
}
dx_leaves[i] = bh;
- ocfs2_set_new_buffer_uptodate(dir, bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh);
- ret = ocfs2_journal_access_dl(handle, dir, bh,
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret < 0) {
mlog_errno(ret);
@@ -2513,11 +2445,10 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
dx_leaf->dl_list.de_count =
cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
- mlog(0,
- "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- (unsigned long long)bh->b_blocknr,
- le16_to_cpu(dx_leaf->dl_list.de_count));
+ trace_ocfs2_dx_dir_format_cluster(
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(dx_leaf->dl_list.de_count));
ocfs2_journal_dirty(handle, bh);
}
@@ -2549,7 +2480,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
* chance of contiguousness as the directory grows in number
* of entries.
*/
- ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
+ ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2582,7 +2513,6 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir,
{
int ret;
u64 phys_blkno;
- struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
num_dx_leaves, &phys_blkno);
@@ -2591,7 +2521,7 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir,
goto out;
}
- ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
+ ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0,
meta_ac);
if (ret)
mlog_errno(ret);
@@ -2762,12 +2692,11 @@ static void ocfs2_dx_dir_index_root_block(struct inode *dir,
ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
- mlog(0,
- "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
- (unsigned long long)dir->i_ino, hinfo.major_hash,
- hinfo.minor_hash,
- le16_to_cpu(dx_root->dr_entries.de_num_used),
- de->name_len, de->name);
+ trace_ocfs2_dx_dir_index_root_block(
+ (unsigned long long)dir->i_ino,
+ hinfo.major_hash, hinfo.minor_hash,
+ de->name_len, de->name,
+ le16_to_cpu(dx_root->dr_entries.de_num_used));
ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
dirent_blk);
@@ -2884,7 +2813,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
bytes = blocks_wanted << sb->s_blocksize_bits;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_inode_info *oi = OCFS2_I(dir);
- struct ocfs2_alloc_context *data_ac;
+ struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
struct buffer_head *dirdata_bh = NULL;
struct buffer_head *dx_root_bh = NULL;
@@ -2895,11 +2824,13 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
struct ocfs2_extent_tree dx_et;
int did_quota = 0, bytes_allocated = 0;
- ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh);
alloc = ocfs2_clusters_for_bytes(sb, bytes);
dx_alloc = 0;
+ down_write(&oi->ip_alloc_sem);
+
if (ocfs2_supports_indexed_dirs(osb)) {
credits += ocfs2_add_dir_index_credits(sb);
@@ -2940,8 +2871,6 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
goto out;
}
- down_write(&oi->ip_alloc_sem);
-
/*
* Prepare for worst case allocation scenario of two separate
* extents in the unindexed tree.
@@ -2953,15 +2882,13 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
- goto out_sem;
+ goto out;
}
- if (vfs_dq_alloc_space_nodirty(dir,
- ocfs2_clusters_to_bytes(osb->sb,
- alloc + dx_alloc))) {
- ret = -EDQUOT;
+ ret = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
+ if (ret)
goto out_commit;
- }
did_quota = 1;
if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -2985,7 +2912,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* if we only get one now, that's enough to continue. The rest
* will be claimed after the conversion to extents.
*/
- ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+ if (ocfs2_dir_resv_allowed(osb))
+ data_ac->ac_resv = &oi->ip_la_data_resv;
+ ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -3000,14 +2929,14 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
dirdata_bh = sb_getblk(sb, blkno);
if (!dirdata_bh) {
- ret = -EIO;
+ ret = -ENOMEM;
mlog_errno(ret);
goto out_commit;
}
- ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh);
- ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
+ ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
@@ -3028,11 +2957,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_init_dir_trailer(dir, dirdata_bh, i);
}
- ret = ocfs2_journal_dirty(handle, dirdata_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
+ ocfs2_journal_dirty(handle, dirdata_bh);
if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
/*
@@ -3060,7 +2986,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* We let the later dirent insert modify c/mtime - to the user
* the data hasn't changed.
*/
- ret = ocfs2_journal_access_di(handle, dir, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
@@ -3080,12 +3006,13 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
di->i_size = cpu_to_le64(sb->s_blocksize);
di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
/*
* This should never fail as our extent list is empty and all
* related blocks have been journaled already.
*/
- ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len,
+ ret = ocfs2_insert_extent(handle, &et, 0, blkno, len,
0, NULL);
if (ret) {
mlog_errno(ret);
@@ -3098,11 +3025,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
*/
dir->i_blocks = ocfs2_inode_sector_count(dir);
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, di_bh);
if (ocfs2_supports_indexed_dirs(osb)) {
ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
@@ -3117,8 +3040,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
dirdata_bh);
} else {
- ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
- ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
+ ocfs2_init_dx_root_extent_tree(&dx_et,
+ INODE_CACHE(dir),
+ dx_root_bh);
+ ret = ocfs2_insert_extent(handle, &dx_et, 0,
dx_insert_blkno, 1, 0, NULL);
if (ret)
mlog_errno(ret);
@@ -3130,7 +3055,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* pass. Claim the 2nd cluster as a separate extent.
*/
if (alloc > len) {
- ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+ ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
&len);
if (ret) {
mlog_errno(ret);
@@ -3138,7 +3063,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
}
blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
- ret = ocfs2_insert_extent(osb, handle, dir, &et, 1,
+ ret = ocfs2_insert_extent(handle, &et, 1,
blkno, len, 0, NULL);
if (ret) {
mlog_errno(ret);
@@ -3168,14 +3093,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
out_commit:
if (ret < 0 && did_quota)
- vfs_dq_free_space_nodirty(dir, bytes_allocated);
+ dquot_free_space_nodirty(dir, bytes_allocated);
ocfs2_commit_trans(osb, handle);
-out_sem:
- up_write(&oi->ip_alloc_sem);
-
out:
+ up_write(&oi->ip_alloc_sem);
if (data_ac)
ocfs2_free_alloc_context(data_ac);
if (meta_ac)
@@ -3213,11 +3136,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
if (extend) {
u32 offset = OCFS2_I(dir)->ip_clusters;
- if (vfs_dq_alloc_space_nodirty(dir,
- ocfs2_clusters_to_bytes(sb, 1))) {
- status = -EDQUOT;
+ status = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(sb, 1));
+ if (status)
goto bail;
- }
did_quota = 1;
status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
@@ -3239,15 +3161,14 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
*new_bh = sb_getblk(sb, p_blkno);
if (!*new_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
status = 0;
bail:
if (did_quota && status < 0)
- vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
- mlog_exit(status);
+ dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
return status;
}
@@ -3282,8 +3203,6 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
struct ocfs2_extent_tree et;
struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
- mlog_entry_void();
-
if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
/*
* This would be a code error as an inline directory should
@@ -3322,21 +3241,26 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
brelse(new_bh);
new_bh = NULL;
+ down_write(&OCFS2_I(dir)->ip_alloc_sem);
+ drop_alloc_sem = 1;
dir_i_size = i_size_read(dir);
credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
goto do_extend;
}
+ down_write(&OCFS2_I(dir)->ip_alloc_sem);
+ drop_alloc_sem = 1;
dir_i_size = i_size_read(dir);
- mlog(0, "extending dir %llu (i_size = %lld)\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
+ trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno,
+ dir_i_size);
/* dir->i_size is always block aligned. */
spin_lock(&OCFS2_I(dir)->ip_lock);
if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
spin_unlock(&OCFS2_I(dir)->ip_lock);
- ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh);
- num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
+ parent_fe_bh);
+ num_free_extents = ocfs2_num_free_extents(osb, &et);
if (num_free_extents < 0) {
status = num_free_extents;
mlog_errno(status);
@@ -3359,7 +3283,10 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
goto bail;
}
- credits = ocfs2_calc_extend_credits(sb, el, 1);
+ if (ocfs2_dir_resv_allowed(osb))
+ data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
+
+ credits = ocfs2_calc_extend_credits(sb, el);
} else {
spin_unlock(&OCFS2_I(dir)->ip_lock);
credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@@ -3370,9 +3297,6 @@ do_extend:
credits++; /* For attaching the new dirent block to the
* dx_root */
- down_write(&OCFS2_I(dir)->ip_alloc_sem);
- drop_alloc_sem = 1;
-
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
@@ -3388,9 +3312,9 @@ do_extend:
goto bail;
}
- ocfs2_set_new_buffer_uptodate(dir, new_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh);
- status = ocfs2_journal_access_db(handle, dir, new_bh,
+ status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -3416,11 +3340,8 @@ do_extend:
} else {
de->rec_len = cpu_to_le16(sb->s_blocksize);
}
- status = ocfs2_journal_dirty(handle, new_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
+ ocfs2_journal_dirty(handle, new_bh);
dir_i_size += dir->i_sb->s_blocksize;
i_size_write(dir, dir_i_size);
@@ -3435,10 +3356,10 @@ bail_bh:
*new_de_bh = new_bh;
get_bh(*new_de_bh);
bail:
- if (drop_alloc_sem)
- up_write(&OCFS2_I(dir)->ip_alloc_sem);
if (handle)
ocfs2_commit_trans(osb, handle);
+ if (drop_alloc_sem)
+ up_write(&OCFS2_I(dir)->ip_alloc_sem);
if (data_ac)
ocfs2_free_alloc_context(data_ac);
@@ -3447,7 +3368,6 @@ bail:
brelse(new_bh);
- mlog_exit(status);
return status;
}
@@ -3594,8 +3514,9 @@ next:
status = 0;
bail:
brelse(bh);
+ if (status)
+ mlog_errno(status);
- mlog_exit(status);
return status;
}
@@ -3798,7 +3719,7 @@ static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
{
int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
- credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
+ credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list);
credits += ocfs2_quota_trans_credits(osb->sb);
return credits;
}
@@ -3826,11 +3747,11 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
struct ocfs2_dx_root_block *dx_root;
struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
- mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- (unsigned long long)leaf_blkno, insert_hash);
+ trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)leaf_blkno,
+ insert_hash);
- ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+ ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
/*
@@ -3879,14 +3800,13 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
goto out;
}
- if (vfs_dq_alloc_space_nodirty(dir,
- ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
- ret = -EDQUOT;
+ ret = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(dir->i_sb, 1));
+ if (ret)
goto out_commit;
- }
did_quota = 1;
- ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -3900,11 +3820,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
dx_leaf_sort_swap);
- ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, dx_leaf_bh);
ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
&split_hash);
@@ -3913,8 +3829,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
goto out_commit;
}
- mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
- leaf_cpos, split_hash, insert_hash);
+ trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash);
/*
* We have to carefully order operations here. There are items
@@ -3949,15 +3864,6 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
goto out_commit;
}
- for (i = 0; i < num_dx_leaves; i++) {
- ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
- }
-
cpos = split_hash;
ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
data_ac, meta_ac, new_dx_leaves,
@@ -3967,14 +3873,33 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
goto out_commit;
}
+ for (i = 0; i < num_dx_leaves; i++) {
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+ orig_dx_leaves[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+ new_dx_leaves[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ }
+
ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
orig_dx_leaves, new_dx_leaves, num_dx_leaves);
out_commit:
if (ret < 0 && did_quota)
- vfs_dq_free_space_nodirty(dir,
+ dquot_free_space_nodirty(dir,
ocfs2_clusters_to_bytes(dir->i_sb, 1));
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_commit_trans(osb, handle);
out:
@@ -4154,11 +4079,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
goto out;
}
- if (vfs_dq_alloc_space_nodirty(dir,
- ocfs2_clusters_to_bytes(osb->sb, 1))) {
- ret = -EDQUOT;
+ ret = dquot_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (ret)
goto out_commit;
- }
did_quota = 1;
/*
@@ -4166,7 +4090,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
* failure to add the dx_root_bh to the journal won't result
* us losing clusters.
*/
- ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+ ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -4208,18 +4132,18 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
/* This should never fail considering we start with an empty
* dx_root. */
- ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
- ret = ocfs2_insert_extent(osb, handle, dir, &et, 0,
- insert_blkno, 1, 0, NULL);
+ ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
+ ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL);
if (ret)
mlog_errno(ret);
did_quota = 0;
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_journal_dirty(handle, dx_root_bh);
out_commit:
if (ret < 0 && did_quota)
- vfs_dq_free_space_nodirty(dir,
+ dquot_free_space_nodirty(dir,
ocfs2_clusters_to_bytes(dir->i_sb, 1));
ocfs2_commit_trans(osb, handle);
@@ -4364,8 +4288,8 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
unsigned int blocks_wanted = 1;
struct buffer_head *bh = NULL;
- mlog(0, "getting ready to insert namelen %d into dir %llu\n",
- namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
+ trace_ocfs2_prepare_dir_for_insert(
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
if (!namelen) {
ret = -EINVAL;
@@ -4470,22 +4394,28 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
goto out_unlock;
}
- ret = ocfs2_journal_access_di(handle, dir, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
+ spin_lock(&OCFS2_I(dir)->ip_lock);
OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+ spin_unlock(&OCFS2_I(dir)->ip_lock);
di->i_dx_root = cpu_to_le64(0ULL);
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_journal_dirty(handle, di_bh);
blk = le64_to_cpu(dx_root->dr_blkno);
bit = le16_to_cpu(dx_root->dr_suballoc_bit);
- bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+ if (dx_root->dr_suballoc_loc)
+ bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
+ else
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
bit, bg_blkno, 1);
if (ret)
@@ -4533,7 +4463,7 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
if (ocfs2_dx_root_inline(dx_root))
goto remove_index;
- ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+ ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
/* XXX: What if dr_clusters is too large? */
while (le32_to_cpu(dx_root->dr_clusters)) {
@@ -4546,8 +4476,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
- ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
- &dealloc);
+ ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
+ &dealloc, 0);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4566,7 +4496,7 @@ remove_index:
goto out;
}
- ocfs2_remove_from_cache(dir, dx_root_bh);
+ ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh);
out:
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &dealloc);
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index e683f3deb64..f0344b75b14 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -92,9 +92,8 @@ int ocfs2_find_files_on_disk(const char *name,
struct ocfs2_dir_lookup_result *res);
int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
int namelen, u64 *blkno);
-int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
-int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
- filldir_t filldir);
+int ocfs2_readdir(struct file *file, struct dir_context *ctx);
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx);
int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
struct inode *dir,
struct buffer_head *parent_fe_bh,
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 19036137570..bd1aab1f49a 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,8 +1,7 @@
-EXTRA_CFLAGS += -Ifs/ocfs2
+ccflags-y := -Ifs/ocfs2
-obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
- dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
+ dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o
-ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
index b5786a787fa..3cfa114aa39 100644
--- a/fs/ocfs2/dlm/dlmapi.h
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -95,7 +95,7 @@ const char *dlm_errname(enum dlm_status err);
mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \
} while (0)
-#define DLM_LKSB_UNUSED1 0x01
+#define DLM_LKSB_UNUSED1 0x01
#define DLM_LKSB_PUT_LVB 0x02
#define DLM_LKSB_GET_LVB 0x04
#define DLM_LKSB_UNUSED2 0x08
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index d07ddbe4b28..b46278f9ae4 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,9 +28,7 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -90,22 +88,31 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
return 0;
}
-static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
+ struct dlm_lock_resource *res;
BUG_ON(!dlm);
BUG_ON(!lock);
+ res = lock->lockres;
+
assert_spin_locked(&dlm->ast_lock);
+
if (!list_empty(&lock->ast_list)) {
- mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n",
+ mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
+ "AST list not empty, pending %d, newlevel %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
lock->ast_pending, lock->ml.type);
BUG();
}
- BUG_ON(!list_empty(&lock->ast_list));
if (lock->ast_pending)
- mlog(0, "lock has an ast getting flushed right now\n");
+ mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
/* putting lock on list, add a ref */
dlm_lock_get(lock);
@@ -113,9 +120,10 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
/* check to see if this ast obsoletes the bast */
if (dlm_should_cancel_bast(dlm, lock)) {
- struct dlm_lock_resource *res = lock->lockres;
- mlog(0, "%s: cancelling bast for %.*s\n",
- dlm->name, res->lockname.len, res->lockname.name);
+ mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lock->bast_pending = 0;
list_del_init(&lock->bast_list);
lock->ml.highest_blocked = LKM_IVMODE;
@@ -125,7 +133,7 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
dlm_lock_put(lock);
/* free up the reserved bast that we are cancelling.
* guaranteed that this will not be the last reserved
- * ast because *both* an ast and a bast were reserved
+ * ast because *both* an ast and a bast were reserved
* to get to this point. the res->spinlock will not be
* taken here */
dlm_lockres_release_ast(dlm, res);
@@ -137,8 +145,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
-
BUG_ON(!dlm);
BUG_ON(!lock);
@@ -148,17 +154,23 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
}
-static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
+ struct dlm_lock_resource *res;
BUG_ON(!dlm);
BUG_ON(!lock);
+
assert_spin_locked(&dlm->ast_lock);
+ res = lock->lockres;
+
BUG_ON(!list_empty(&lock->bast_list));
if (lock->bast_pending)
- mlog(0, "lock has a bast getting flushed right now\n");
+ mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
/* putting lock on list, add a ref */
dlm_lock_get(lock);
@@ -170,8 +182,6 @@ static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
-
BUG_ON(!dlm);
BUG_ON(!lock);
@@ -187,9 +197,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
BUG_ON(!lksb);
/* only updates if this node masters the lockres */
+ spin_lock(&res->spinlock);
if (res->owner == dlm->node_num) {
-
- spin_lock(&res->spinlock);
/* check the lksb flags for the direction */
if (lksb->flags & DLM_LKSB_GET_LVB) {
mlog(0, "getting lvb from lockres for %s node\n",
@@ -204,8 +213,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
* here. In the future we might want to clear it at the time
* the put is actually done.
*/
- spin_unlock(&res->spinlock);
}
+ spin_unlock(&res->spinlock);
/* reset any lvb flags on the lksb */
lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
@@ -217,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
dlm_astlockfunc_t *fn;
struct dlm_lockstatus *lksb;
- mlog_entry_void();
+ mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lksb = lock->lksb;
fn = lock->ast;
@@ -235,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lockstatus *lksb;
int lksbflags;
- mlog_entry_void();
+ mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lksb = lock->lksb;
BUG_ON(lock->ml.node == dlm->node_num);
@@ -254,9 +269,14 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
{
dlm_bastlockfunc_t *fn = lock->bast;
- mlog_entry_void();
BUG_ON(lock->ml.node != dlm->node_num);
+ mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ blocked_type);
+
(*fn)(lock->astdata, blocked_type);
}
@@ -272,8 +292,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
struct dlm_lock *lock = NULL;
struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
char *name;
- struct list_head *iter, *head=NULL;
- u64 cookie;
+ struct list_head *head = NULL;
+ __be64 cookie;
u32 flags;
u8 node;
@@ -336,7 +356,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
/* cannot get a proxy ast message if this node owns it */
BUG_ON(res->owner == dlm->node_num);
- mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
+ mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -352,8 +373,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
/* try convert queue for both ast/bast */
head = &res->converting;
lock = NULL;
- list_for_each(iter, head) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry(lock, head, list) {
if (lock->ml.cookie == cookie)
goto do_ast;
}
@@ -364,8 +384,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
else
head = &res->granted;
- list_for_each(iter, head) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry(lock, head, list) {
if (lock->ml.cookie == cookie)
goto do_ast;
}
@@ -386,8 +405,12 @@ do_ast:
if (past->type == DLM_AST) {
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->granted);
- mlog(0, "ast: Adding to granted list... type=%d, "
- "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+ mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+ lock->ml.type, lock->ml.convert_type);
+
if (lock->ml.convert_type != LKM_IVMODE) {
lock->ml.type = lock->ml.convert_type;
lock->ml.convert_type = LKM_IVMODE;
@@ -430,9 +453,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
size_t veclen = 1;
int status;
- mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
- res->lockname.len, res->lockname.name, lock->ml.node,
- msg_type, blocked_type);
+ mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
+ res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
+ blocked_type);
memset(&past, 0, sizeof(struct dlm_proxy_ast));
past.node_idx = dlm->node_num;
@@ -445,7 +468,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
vec[0].iov_len = sizeof(struct dlm_proxy_ast);
vec[0].iov_base = &past;
if (flags & DLM_LKSB_GET_LVB) {
- mlog(0, "returning requested LVB data\n");
be32_add_cpu(&past.flags, LKM_GET_LVB);
vec[1].iov_len = DLM_LVB_LEN;
vec[1].iov_base = lock->lksb->lvb;
@@ -455,7 +477,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
lock->ml.node, &status);
if (ret < 0)
- mlog_errno(ret);
+ mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
+ dlm->name, res->lockname.len, res->lockname.name, ret,
+ lock->ml.node);
else {
if (status == DLM_RECOVERING) {
mlog(ML_ERROR, "sent AST to node %u, it thinks this "
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0102be35980..fae17c640df 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,7 @@
#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
#define DLM_THREAD_MS 200 // flush at least every 200 ms
-#define DLM_HASH_SIZE_DEFAULT (1 << 14)
+#define DLM_HASH_SIZE_DEFAULT (1 << 17)
#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
# define DLM_HASH_PAGES 1
#else
@@ -50,10 +50,10 @@
#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
enum dlm_mle_type {
- DLM_MLE_BLOCK,
- DLM_MLE_MASTER,
- DLM_MLE_MIGRATION,
- DLM_MLE_NUM_TYPES
+ DLM_MLE_BLOCK = 0,
+ DLM_MLE_MASTER = 1,
+ DLM_MLE_MIGRATION = 2,
+ DLM_MLE_NUM_TYPES = 3,
};
struct dlm_master_list_entry {
@@ -82,8 +82,8 @@ struct dlm_master_list_entry {
enum dlm_ast_type {
DLM_AST = 0,
- DLM_BAST,
- DLM_ASTUNLOCK
+ DLM_BAST = 1,
+ DLM_ASTUNLOCK = 2,
};
@@ -108,7 +108,6 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
struct dlm_recovery_ctxt
{
struct list_head resources;
- struct list_head received;
struct list_head node_data;
u8 new_master;
u8 dead_node;
@@ -119,9 +118,9 @@ struct dlm_recovery_ctxt
enum dlm_ctxt_state {
DLM_CTXT_NEW = 0,
- DLM_CTXT_JOINED,
- DLM_CTXT_IN_SHUTDOWN,
- DLM_CTXT_LEAVING,
+ DLM_CTXT_JOINED = 1,
+ DLM_CTXT_IN_SHUTDOWN = 2,
+ DLM_CTXT_LEAVING = 3,
};
struct dlm_ctxt
@@ -144,6 +143,7 @@ struct dlm_ctxt
wait_queue_head_t dlm_join_events;
unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct dlm_recovery_ctxt reco;
spinlock_t master_lock;
@@ -331,6 +331,7 @@ struct dlm_lock_resource
u16 state;
char lvb[DLM_LVB_LEN];
unsigned int inflight_locks;
+ unsigned int inflight_assert_workers;
unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
};
@@ -388,8 +389,8 @@ struct dlm_lock
enum dlm_lockres_list {
DLM_GRANTED_LIST = 0,
- DLM_CONVERTING_LIST,
- DLM_BLOCKED_LIST
+ DLM_CONVERTING_LIST = 1,
+ DLM_BLOCKED_LIST = 2,
};
static inline int dlm_lvb_is_empty(char *lvb)
@@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
return 1;
}
+static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
+{
+ if (idx == DLM_GRANTED_LIST)
+ return "granted";
+ else if (idx == DLM_CONVERTING_LIST)
+ return "converting";
+ else if (idx == DLM_BLOCKED_LIST)
+ return "blocked";
+ else
+ return "unknown";
+}
+
static inline struct list_head *
dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
{
@@ -427,25 +440,28 @@ struct dlm_node_iter
enum {
- DLM_MASTER_REQUEST_MSG = 500,
- DLM_UNUSED_MSG1, /* 501 */
- DLM_ASSERT_MASTER_MSG, /* 502 */
- DLM_CREATE_LOCK_MSG, /* 503 */
- DLM_CONVERT_LOCK_MSG, /* 504 */
- DLM_PROXY_AST_MSG, /* 505 */
- DLM_UNLOCK_LOCK_MSG, /* 506 */
- DLM_DEREF_LOCKRES_MSG, /* 507 */
- DLM_MIGRATE_REQUEST_MSG, /* 508 */
- DLM_MIG_LOCKRES_MSG, /* 509 */
- DLM_QUERY_JOIN_MSG, /* 510 */
- DLM_ASSERT_JOINED_MSG, /* 511 */
- DLM_CANCEL_JOIN_MSG, /* 512 */
- DLM_EXIT_DOMAIN_MSG, /* 513 */
- DLM_MASTER_REQUERY_MSG, /* 514 */
- DLM_LOCK_REQUEST_MSG, /* 515 */
- DLM_RECO_DATA_DONE_MSG, /* 516 */
- DLM_BEGIN_RECO_MSG, /* 517 */
- DLM_FINALIZE_RECO_MSG /* 518 */
+ DLM_MASTER_REQUEST_MSG = 500,
+ DLM_UNUSED_MSG1 = 501,
+ DLM_ASSERT_MASTER_MSG = 502,
+ DLM_CREATE_LOCK_MSG = 503,
+ DLM_CONVERT_LOCK_MSG = 504,
+ DLM_PROXY_AST_MSG = 505,
+ DLM_UNLOCK_LOCK_MSG = 506,
+ DLM_DEREF_LOCKRES_MSG = 507,
+ DLM_MIGRATE_REQUEST_MSG = 508,
+ DLM_MIG_LOCKRES_MSG = 509,
+ DLM_QUERY_JOIN_MSG = 510,
+ DLM_ASSERT_JOINED_MSG = 511,
+ DLM_CANCEL_JOIN_MSG = 512,
+ DLM_EXIT_DOMAIN_MSG = 513,
+ DLM_MASTER_REQUERY_MSG = 514,
+ DLM_LOCK_REQUEST_MSG = 515,
+ DLM_RECO_DATA_DONE_MSG = 516,
+ DLM_BEGIN_RECO_MSG = 517,
+ DLM_FINALIZE_RECO_MSG = 518,
+ DLM_QUERY_REGION = 519,
+ DLM_QUERY_NODEINFO = 520,
+ DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
};
struct dlm_reco_node_data
@@ -458,19 +474,19 @@ struct dlm_reco_node_data
enum {
DLM_RECO_NODE_DATA_DEAD = -1,
DLM_RECO_NODE_DATA_INIT = 0,
- DLM_RECO_NODE_DATA_REQUESTING,
- DLM_RECO_NODE_DATA_REQUESTED,
- DLM_RECO_NODE_DATA_RECEIVING,
- DLM_RECO_NODE_DATA_DONE,
- DLM_RECO_NODE_DATA_FINALIZE_SENT,
+ DLM_RECO_NODE_DATA_REQUESTING = 1,
+ DLM_RECO_NODE_DATA_REQUESTED = 2,
+ DLM_RECO_NODE_DATA_RECEIVING = 3,
+ DLM_RECO_NODE_DATA_DONE = 4,
+ DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
};
enum {
DLM_MASTER_RESP_NO = 0,
- DLM_MASTER_RESP_YES,
- DLM_MASTER_RESP_MAYBE,
- DLM_MASTER_RESP_ERROR
+ DLM_MASTER_RESP_YES = 1,
+ DLM_MASTER_RESP_MAYBE = 2,
+ DLM_MASTER_RESP_ERROR = 3,
};
@@ -647,9 +663,9 @@ struct dlm_proxy_ast
#define DLM_MOD_KEY (0x666c6172)
enum dlm_query_join_response_code {
JOIN_DISALLOW = 0,
- JOIN_OK,
- JOIN_OK_NO_MAP,
- JOIN_PROTOCOL_MISMATCH,
+ JOIN_OK = 1,
+ JOIN_OK_NO_MAP = 2,
+ JOIN_PROTOCOL_MISMATCH = 3,
};
struct dlm_query_join_packet {
@@ -663,7 +679,7 @@ struct dlm_query_join_packet {
};
union dlm_query_join_response {
- u32 intval;
+ __be32 intval;
struct dlm_query_join_packet packet;
};
@@ -727,6 +743,31 @@ struct dlm_cancel_join
u8 domain[O2NM_MAX_NAME_LEN];
};
+struct dlm_query_region {
+ u8 qr_node;
+ u8 qr_numregions;
+ u8 qr_namelen;
+ u8 pad1;
+ u8 qr_domain[O2NM_MAX_NAME_LEN];
+ u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
+};
+
+struct dlm_node_info {
+ u8 ni_nodenum;
+ u8 pad1;
+ __be16 ni_ipv4_port;
+ __be32 ni_ipv4_address;
+};
+
+struct dlm_query_nodeinfo {
+ u8 qn_nodenum;
+ u8 qn_numnodes;
+ u8 qn_namelen;
+ u8 pad1;
+ u8 qn_domain[O2NM_MAX_NAME_LEN];
+ struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
+};
+
struct dlm_exit_domain
{
u8 node_idx;
@@ -818,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
void dlm_put(struct dlm_ctxt *dlm);
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -836,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res)
kref_get(&res->refs);
}
void dlm_lockres_put(struct dlm_lock_resource *res);
-void __dlm_unhash_lockres(struct dlm_lock_resource *res);
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res);
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
const char *name,
unsigned int len,
@@ -861,49 +901,23 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
const char *name,
unsigned int namelen);
-#define dlm_lockres_set_refmap_bit(bit,res) \
- __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__)
-#define dlm_lockres_clear_refmap_bit(bit,res) \
- __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__)
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit);
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit);
-static inline void __dlm_lockres_set_refmap_bit(int bit,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
-{
- //printk("%s:%d:%.*s: setting bit %d\n", file, line,
- // res->lockname.len, res->lockname.name, bit);
- set_bit(bit, res->refmap);
-}
-
-static inline void __dlm_lockres_clear_refmap_bit(int bit,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
-{
- //printk("%s:%d:%.*s: clearing bit %d\n", file, line,
- // res->lockname.len, res->lockname.name, bit);
- clear_bit(bit, res->refmap);
-}
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
-void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- const char *file,
- int line);
-void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- int new_lockres,
- const char *file,
- int line);
-#define dlm_lockres_drop_inflight_ref(d,r) \
- __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__)
-#define dlm_lockres_grab_inflight_ref(d,r) \
- __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__)
-#define dlm_lockres_grab_inflight_ref_new(d,r) \
- __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__)
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_do_local_ast(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock);
@@ -1028,6 +1042,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
void dlm_clean_master_list(struct dlm_ctxt *dlm,
u8 dead_node);
+void dlm_force_free_mles(struct dlm_ctxt *dlm);
int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
int __dlm_lockres_unused(struct dlm_lock_resource *res);
@@ -1067,11 +1082,9 @@ static inline int dlm_lock_compatible(int existing, int request)
static inline int dlm_lock_on_list(struct list_head *head,
struct dlm_lock *lock)
{
- struct list_head *iter;
struct dlm_lock *tmplock;
- list_for_each(iter, head) {
- tmplock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(tmplock, head, list) {
if (tmplock == lock)
return 1;
}
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 75997b4deaf..e36d63ff178 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,9 +28,7 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -125,13 +123,12 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
int *kick_thread)
{
enum dlm_status status = DLM_NORMAL;
- struct list_head *iter;
struct dlm_lock *tmplock=NULL;
assert_spin_locked(&res->spinlock);
- mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n",
- lock->ml.type, lock->ml.convert_type, type);
+ mlog(0, "type=%d, convert_type=%d, new convert_type=%d\n",
+ lock->ml.type, lock->ml.convert_type, type);
spin_lock(&lock->spinlock);
@@ -187,16 +184,14 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
/* upconvert from here on */
status = DLM_NORMAL;
- list_for_each(iter, &res->granted) {
- tmplock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(tmplock, &res->granted, list) {
if (tmplock == lock)
continue;
if (!dlm_lock_compatible(tmplock->ml.type, type))
goto switch_queues;
}
- list_for_each(iter, &res->converting) {
- tmplock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(tmplock, &res->converting, list) {
if (!dlm_lock_compatible(tmplock->ml.type, type))
goto switch_queues;
/* existing conversion requests take precedence */
@@ -355,7 +350,7 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
struct kvec vec[2];
size_t veclen = 1;
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+ mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
memset(&convert, 0, sizeof(struct dlm_convert_lock));
convert.node_idx = dlm->node_num;
@@ -392,12 +387,14 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
dlm_error(ret);
} else {
- mlog_errno(tmpret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
+ res->owner);
if (dlm_is_host_down(tmpret)) {
/* instead of logging the same network error over
* and over, sleep here and wait for the heartbeat
* to notice the node is dead. times out after 5s. */
- dlm_wait_for_node_death(dlm, res->owner,
+ dlm_wait_for_node_death(dlm, res->owner,
DLM_NODE_DEATH_WAIT_MAX);
ret = DLM_RECOVERING;
mlog(0, "node %u died so returning DLM_RECOVERING "
@@ -424,8 +421,8 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
struct dlm_ctxt *dlm = data;
struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
- struct list_head *iter;
struct dlm_lock *lock = NULL;
+ struct dlm_lock *tmp_lock;
struct dlm_lockstatus *lksb;
enum dlm_status status = DLM_NORMAL;
u32 flags;
@@ -471,14 +468,13 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
dlm_error(status);
goto leave;
}
- list_for_each(iter, &res->granted) {
- lock = list_entry(iter, struct dlm_lock, list);
- if (lock->ml.cookie == cnv->cookie &&
- lock->ml.node == cnv->node_idx) {
+ list_for_each_entry(tmp_lock, &res->granted, list) {
+ if (tmp_lock->ml.cookie == cnv->cookie &&
+ tmp_lock->ml.node == cnv->node_idx) {
+ lock = tmp_lock;
dlm_lock_get(lock);
break;
}
- lock = NULL;
}
spin_unlock(&res->spinlock);
if (!lock) {
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index df52f706f66..18f13c2e4a1 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -27,10 +27,10 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/sysctl.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
+#include <linux/export.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
@@ -96,14 +96,13 @@ static void __dlm_print_lock(struct dlm_lock *lock)
void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
{
- struct list_head *iter2;
struct dlm_lock *lock;
char buf[DLM_LOCKID_NAME_MAX];
assert_spin_locked(&res->spinlock);
stringify_lockname(res->lockname.name, res->lockname.len,
- buf, sizeof(buf) - 1);
+ buf, sizeof(buf));
printk("lockres: %s, owner=%u, state=%u\n",
buf, res->owner, res->state);
printk(" last used: %lu, refcnt: %u, on purge list: %s\n",
@@ -118,18 +117,15 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
res->inflight_locks, atomic_read(&res->asts_reserved));
dlm_print_lockres_refmap(res);
printk(" granted queue:\n");
- list_for_each(iter2, &res->granted) {
- lock = list_entry(iter2, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->granted, list) {
__dlm_print_lock(lock);
}
printk(" converting queue:\n");
- list_for_each(iter2, &res->converting) {
- lock = list_entry(iter2, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->converting, list) {
__dlm_print_lock(lock);
}
printk(" blocked queue:\n");
- list_for_each(iter2, &res->blocked) {
- lock = list_entry(iter2, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->blocked, list) {
__dlm_print_lock(lock);
}
}
@@ -342,7 +338,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
#ifdef CONFIG_DEBUG_FS
-static struct dentry *dlm_debugfs_root = NULL;
+static struct dentry *dlm_debugfs_root;
#define DLM_DEBUGFS_DIR "o2dlm"
#define DLM_DEBUGFS_DLM_STATE "dlm_state"
@@ -371,92 +367,46 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc)
kref_get(&dc->debug_refcnt);
}
-static struct debug_buffer *debug_buffer_allocate(void)
+static int debug_release(struct inode *inode, struct file *file)
{
- struct debug_buffer *db = NULL;
-
- db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
- if (!db)
- goto bail;
-
- db->len = PAGE_SIZE;
- db->buf = kmalloc(db->len, GFP_KERNEL);
- if (!db->buf)
- goto bail;
-
- return db;
-bail:
- kfree(db);
- return NULL;
-}
-
-static ssize_t debug_buffer_read(struct file *file, char __user *buf,
- size_t nbytes, loff_t *ppos)
-{
- struct debug_buffer *db = file->private_data;
-
- return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
-}
-
-static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
-{
- struct debug_buffer *db = file->private_data;
- loff_t new = -1;
-
- switch (whence) {
- case 0:
- new = off;
- break;
- case 1:
- new = file->f_pos + off;
- break;
- }
-
- if (new < 0 || new > db->len)
- return -EINVAL;
-
- return (file->f_pos = new);
+ free_page((unsigned long)file->private_data);
+ return 0;
}
-static int debug_buffer_release(struct inode *inode, struct file *file)
+static ssize_t debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
{
- struct debug_buffer *db = (struct debug_buffer *)file->private_data;
-
- if (db)
- kfree(db->buf);
- kfree(db);
-
- return 0;
+ return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+ i_size_read(file->f_mapping->host));
}
/* end - util funcs */
/* begin - purge list funcs */
-static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
{
struct dlm_lock_resource *res;
int out = 0;
unsigned long total = 0;
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Dumping Purgelist for Domain: %s\n", dlm->name);
spin_lock(&dlm->spinlock);
list_for_each_entry(res, &dlm->purge_list, purge) {
++total;
- if (db->len - out < 100)
+ if (len - out < 100)
continue;
spin_lock(&res->spinlock);
out += stringify_lockname(res->lockname.name,
res->lockname.len,
- db->buf + out, db->len - out);
- out += snprintf(db->buf + out, db->len - out, "\t%ld\n",
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\t%ld\n",
(jiffies - res->last_used)/HZ);
spin_unlock(&res->spinlock);
}
spin_unlock(&dlm->spinlock);
- out += snprintf(db->buf + out, db->len - out,
- "Total on list: %ld\n", total);
+ out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
return out;
}
@@ -464,59 +414,56 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
static int debug_purgelist_open(struct inode *inode, struct file *file)
{
struct dlm_ctxt *dlm = inode->i_private;
- struct debug_buffer *db;
+ char *buf = NULL;
- db = debug_buffer_allocate();
- if (!db)
+ buf = (char *) get_zeroed_page(GFP_NOFS);
+ if (!buf)
goto bail;
- db->len = debug_purgelist_print(dlm, db);
+ i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
- file->private_data = db;
+ file->private_data = buf;
return 0;
bail:
return -ENOMEM;
}
-static struct file_operations debug_purgelist_fops = {
+static const struct file_operations debug_purgelist_fops = {
.open = debug_purgelist_open,
- .release = debug_buffer_release,
- .read = debug_buffer_read,
- .llseek = debug_buffer_llseek,
+ .release = debug_release,
+ .read = debug_read,
+ .llseek = generic_file_llseek,
};
/* end - purge list funcs */
/* begin - debug mle funcs */
-static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
{
struct dlm_master_list_entry *mle;
struct hlist_head *bucket;
- struct hlist_node *list;
int i, out = 0;
- unsigned long total = 0, longest = 0, bktcnt;
+ unsigned long total = 0, longest = 0, bucket_count = 0;
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Dumping MLEs for Domain: %s\n", dlm->name);
spin_lock(&dlm->master_lock);
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_master_hash(dlm, i);
- hlist_for_each(list, bucket) {
- mle = hlist_entry(list, struct dlm_master_list_entry,
- master_hash_node);
+ hlist_for_each_entry(mle, bucket, master_hash_node) {
++total;
- ++bktcnt;
- if (db->len - out < 200)
+ ++bucket_count;
+ if (len - out < 200)
continue;
- out += dump_mle(mle, db->buf + out, db->len - out);
+ out += dump_mle(mle, buf + out, len - out);
}
- longest = max(longest, bktcnt);
- bktcnt = 0;
+ longest = max(longest, bucket_count);
+ bucket_count = 0;
}
spin_unlock(&dlm->master_lock);
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Total: %ld, Longest: %ld\n", total, longest);
return out;
}
@@ -524,26 +471,26 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
static int debug_mle_open(struct inode *inode, struct file *file)
{
struct dlm_ctxt *dlm = inode->i_private;
- struct debug_buffer *db;
+ char *buf = NULL;
- db = debug_buffer_allocate();
- if (!db)
+ buf = (char *) get_zeroed_page(GFP_NOFS);
+ if (!buf)
goto bail;
- db->len = debug_mle_print(dlm, db);
+ i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
- file->private_data = db;
+ file->private_data = buf;
return 0;
bail:
return -ENOMEM;
}
-static struct file_operations debug_mle_fops = {
+static const struct file_operations debug_mle_fops = {
.open = debug_mle_open,
- .release = debug_buffer_release,
- .read = debug_buffer_read,
- .llseek = debug_buffer_llseek,
+ .release = debug_release,
+ .read = debug_read,
+ .llseek = generic_file_llseek,
};
/* end - debug mle funcs */
@@ -637,8 +584,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
spin_lock(&dlm->track_lock);
if (oldres)
track_list = &oldres->tracking;
- else
+ else {
track_list = &dlm->tracking_list;
+ if (list_empty(track_list)) {
+ dl = NULL;
+ spin_unlock(&dlm->track_lock);
+ goto bail;
+ }
+ }
list_for_each_entry(res, track_list, tracking) {
if (&res->tracking == &dlm->tracking_list)
@@ -661,6 +614,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
} else
dl = NULL;
+bail:
/* passed to seq_show */
return dl;
}
@@ -683,7 +637,7 @@ static int lockres_seq_show(struct seq_file *s, void *v)
return 0;
}
-static struct seq_operations debug_lockres_ops = {
+static const struct seq_operations debug_lockres_ops = {
.start = lockres_seq_start,
.stop = lockres_seq_stop,
.next = lockres_seq_next,
@@ -716,7 +670,7 @@ static int debug_lockres_open(struct inode *inode, struct file *file)
goto bail;
}
- seq = (struct seq_file *) file->private_data;
+ seq = file->private_data;
seq->private = dl;
dlm_grab(dlm);
@@ -732,7 +686,7 @@ bail:
static int debug_lockres_release(struct inode *inode, struct file *file)
{
- struct seq_file *seq = (struct seq_file *)file->private_data;
+ struct seq_file *seq = file->private_data;
struct debug_lockres *dl = (struct debug_lockres *)seq->private;
if (dl->dl_res)
@@ -742,7 +696,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file)
return seq_release_private(inode, file);
}
-static struct file_operations debug_lockres_fops = {
+static const struct file_operations debug_lockres_fops = {
.open = debug_lockres_open,
.release = debug_lockres_release,
.read = seq_read,
@@ -751,7 +705,7 @@ static struct file_operations debug_lockres_fops = {
/* end - debug lockres funcs */
/* begin - debug state funcs */
-static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
{
int out = 0;
struct dlm_reco_node_data *node;
@@ -775,33 +729,41 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
}
/* Domain: xxxxxxxxxx Key: 0xdfbac769 */
- out += snprintf(db->buf + out, db->len - out,
- "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key);
+ out += snprintf(buf + out, len - out,
+ "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
+ dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor);
/* Thread Pid: xxx Node: xxx State: xxxxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Thread Pid: %d Node: %d State: %s\n",
- dlm->dlm_thread_task->pid, dlm->node_num, state);
+ task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
/* Number of Joins: xxx Joining Node: xxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Number of Joins: %d Joining Node: %d\n",
dlm->num_joins, dlm->joining_node);
/* Domain Map: xx xx xx */
- out += snprintf(db->buf + out, db->len - out, "Domain Map: ");
+ out += snprintf(buf + out, len - out, "Domain Map: ");
out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
- db->buf + out, db->len - out);
- out += snprintf(db->buf + out, db->len - out, "\n");
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
+
+ /* Exit Domain Map: xx xx xx */
+ out += snprintf(buf + out, len - out, "Exit Domain Map: ");
+ out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
/* Live Map: xx xx xx */
- out += snprintf(db->buf + out, db->len - out, "Live Map: ");
+ out += snprintf(buf + out, len - out, "Live Map: ");
out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
- db->buf + out, db->len - out);
- out += snprintf(db->buf + out, db->len - out, "\n");
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
/* Lock Resources: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Lock Resources: %d (%d)\n",
atomic_read(&dlm->res_cur_count),
atomic_read(&dlm->res_tot_count));
@@ -813,29 +775,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
cur_mles += atomic_read(&dlm->mle_cur_count[i]);
/* MLEs: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"MLEs: %d (%d)\n", cur_mles, tot_mles);
/* Blocking: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
" Blocking: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
/* Mastery: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
" Mastery: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
/* Migration: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
" Migration: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
/* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Lists: Dirty=%s Purge=%s PendingASTs=%s "
"PendingBASTs=%s\n",
(list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
@@ -844,12 +806,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
/* Purge Count: xxx Refs: xxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Purge Count: %d Refs: %d\n", dlm->purge_count,
atomic_read(&dlm->dlm_refs.refcount));
/* Dead Node: xxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Dead Node: %d\n", dlm->reco.dead_node);
/* What about DLM_RECO_STATE_FINALIZE? */
@@ -859,19 +821,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
state = "INACTIVE";
/* Recovery Pid: xxxx Master: xxx State: xxxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Recovery Pid: %d Master: %d State: %s\n",
- dlm->dlm_reco_thread_task->pid,
+ task_pid_nr(dlm->dlm_reco_thread_task),
dlm->reco.new_master, state);
/* Recovery Map: xx xx */
- out += snprintf(db->buf + out, db->len - out, "Recovery Map: ");
+ out += snprintf(buf + out, len - out, "Recovery Map: ");
out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
- db->buf + out, db->len - out);
- out += snprintf(db->buf + out, db->len - out, "\n");
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
/* Recovery Node State: */
- out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n");
+ out += snprintf(buf + out, len - out, "Recovery Node State:\n");
list_for_each_entry(node, &dlm->reco.node_data, list) {
switch (node->state) {
case DLM_RECO_NODE_DATA_INIT:
@@ -899,7 +861,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
state = "BAD";
break;
}
- out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n",
+ out += snprintf(buf + out, len - out, "\t%u - %s\n",
node->node_num, state);
}
@@ -911,26 +873,26 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
static int debug_state_open(struct inode *inode, struct file *file)
{
struct dlm_ctxt *dlm = inode->i_private;
- struct debug_buffer *db = NULL;
+ char *buf = NULL;
- db = debug_buffer_allocate();
- if (!db)
+ buf = (char *) get_zeroed_page(GFP_NOFS);
+ if (!buf)
goto bail;
- db->len = debug_state_print(dlm, db);
+ i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
- file->private_data = db;
+ file->private_data = buf;
return 0;
bail:
return -ENOMEM;
}
-static struct file_operations debug_state_fops = {
+static const struct file_operations debug_state_fops = {
.open = debug_state_open,
- .release = debug_buffer_release,
- .read = debug_buffer_read,
- .llseek = debug_buffer_llseek,
+ .release = debug_release,
+ .read = debug_read,
+ .llseek = generic_file_llseek,
};
/* end - debug state funcs */
@@ -994,14 +956,10 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
if (dc) {
- if (dc->debug_purgelist_dentry)
- debugfs_remove(dc->debug_purgelist_dentry);
- if (dc->debug_mle_dentry)
- debugfs_remove(dc->debug_mle_dentry);
- if (dc->debug_lockres_dentry)
- debugfs_remove(dc->debug_lockres_dentry);
- if (dc->debug_state_dentry)
- debugfs_remove(dc->debug_state_dentry);
+ debugfs_remove(dc->debug_purgelist_dentry);
+ debugfs_remove(dc->debug_mle_dentry);
+ debugfs_remove(dc->debug_lockres_dentry);
+ debugfs_remove(dc->debug_state_dentry);
dlm_debug_put(dc);
}
}
@@ -1032,8 +990,7 @@ bail:
void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
- if (dlm->dlm_debugfs_subroot)
- debugfs_remove(dlm->dlm_debugfs_subroot);
+ debugfs_remove(dlm->dlm_debugfs_subroot);
}
/* debugfs root */
@@ -1049,7 +1006,6 @@ int dlm_create_debugfs_root(void)
void dlm_destroy_debugfs_root(void)
{
- if (dlm_debugfs_root)
- debugfs_remove(dlm_debugfs_root);
+ debugfs_remove(dlm_debugfs_root);
}
#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 8c686d22f9c..1f27c4812d1 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -37,11 +37,6 @@ struct dlm_debug_ctxt {
struct dentry *debug_purgelist_dentry;
};
-struct debug_buffer {
- int len;
- char *buf;
-};
-
struct debug_lockres {
int dl_len;
char *dl_buf;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 4d9e6b288dd..39efc5057a3 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -28,7 +28,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
@@ -44,8 +43,6 @@
#include "dlmdomain.h"
#include "dlmdebug.h"
-#include "dlmver.h"
-
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
#include "cluster/masklog.h"
@@ -129,10 +126,16 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
* will have a negotiated version with the same major number and a minor
* number equal or smaller. The dlm_ctxt->dlm_locking_proto field should
* be used to determine what a running domain is actually using.
+ *
+ * New in version 1.1:
+ * - Message DLM_QUERY_REGION added to support global heartbeat
+ * - Message DLM_QUERY_NODEINFO added to allow online node removes
+ * New in version 1.2:
+ * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
*/
static const struct dlm_protocol_version dlm_protocol = {
.pv_major = 1,
- .pv_minor = 0,
+ .pv_minor = 2,
};
#define DLM_DOMAIN_BACKOFF_MS 200
@@ -143,6 +146,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+ void *data, void **ret_data);
static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
static int dlm_protocol_compare(struct dlm_protocol_version *existing,
@@ -150,16 +155,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing,
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
-void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- if (!hlist_unhashed(&lockres->hash_node)) {
- hlist_del_init(&lockres->hash_node);
- dlm_lockres_put(lockres);
- }
+ if (hlist_unhashed(&res->hash_node))
+ return;
+
+ mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
+ hlist_del_init(&res->hash_node);
+ dlm_lockres_put(res);
}
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res)
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
struct hlist_head *bucket;
struct qstr *q;
@@ -173,6 +180,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
dlm_lockres_get(res);
hlist_add_head(&res->hash_node, bucket);
+
+ mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
}
struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
@@ -181,17 +191,15 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
unsigned int hash)
{
struct hlist_head *bucket;
- struct hlist_node *list;
+ struct dlm_lock_resource *res;
- mlog_entry("%.*s\n", len, name);
+ mlog(0, "%.*s\n", len, name);
assert_spin_locked(&dlm->spinlock);
bucket = dlm_lockres_hash(dlm, hash);
- hlist_for_each(list, bucket) {
- struct dlm_lock_resource *res = hlist_entry(list,
- struct dlm_lock_resource, hash_node);
+ hlist_for_each_entry(res, bucket, hash_node) {
if (res->lockname.name[0] != name[0])
continue;
if (unlikely(res->lockname.len != len))
@@ -217,7 +225,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
{
struct dlm_lock_resource *res = NULL;
- mlog_entry("%.*s\n", len, name);
+ mlog(0, "%.*s\n", len, name);
assert_spin_locked(&dlm->spinlock);
@@ -250,22 +258,19 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
{
- struct dlm_ctxt *tmp = NULL;
- struct list_head *iter;
+ struct dlm_ctxt *tmp;
assert_spin_locked(&dlm_domain_lock);
/* tmp->name here is always NULL terminated,
* but domain may not be! */
- list_for_each(iter, &dlm_domains) {
- tmp = list_entry (iter, struct dlm_ctxt, list);
+ list_for_each_entry(tmp, &dlm_domains, list) {
if (strlen(tmp->name) == len &&
memcmp(tmp->name, domain, len)==0)
- break;
- tmp = NULL;
+ return tmp;
}
- return tmp;
+ return NULL;
}
/* For null terminated domain strings ONLY */
@@ -307,9 +312,7 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
if (dlm->master_hash)
dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
- if (dlm->name)
- kfree(dlm->name);
-
+ kfree(dlm->name);
kfree(dlm);
}
@@ -356,25 +359,22 @@ static void __dlm_get(struct dlm_ctxt *dlm)
* you shouldn't trust your pointer. */
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
{
- struct list_head *iter;
- struct dlm_ctxt *target = NULL;
+ struct dlm_ctxt *target;
+ struct dlm_ctxt *ret = NULL;
spin_lock(&dlm_domain_lock);
- list_for_each(iter, &dlm_domains) {
- target = list_entry (iter, struct dlm_ctxt, list);
-
+ list_for_each_entry(target, &dlm_domains, list) {
if (target == dlm) {
__dlm_get(target);
+ ret = target;
break;
}
-
- target = NULL;
}
spin_unlock(&dlm_domain_lock);
- return target;
+ return ret;
}
int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
@@ -444,19 +444,21 @@ redo_bucket:
dropped = dlm_empty_lockres(dlm, res);
spin_lock(&res->spinlock);
- __dlm_lockres_calc_usage(dlm, res);
- iter = res->hash_node.next;
+ if (dropped)
+ __dlm_lockres_calc_usage(dlm, res);
+ else
+ iter = res->hash_node.next;
spin_unlock(&res->spinlock);
dlm_lockres_put(res);
- if (dropped)
+ if (dropped) {
+ cond_resched_lock(&dlm->spinlock);
goto redo_bucket;
+ }
}
cond_resched_lock(&dlm->spinlock);
num += n;
- mlog(0, "%s: touched %d lockreses in bucket %d "
- "(tot=%d)\n", dlm->name, n, i, num);
}
spin_unlock(&dlm->spinlock);
wake_up(&dlm->dlm_thread_wq);
@@ -483,6 +485,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
return ret;
}
+/*
+ * o2net handler for DLM_BEGIN_EXIT_DOMAIN_MSG.
+ *
+ * Marks the sending node in dlm->exit_domain_map. Always returns 0,
+ * including when the domain context can no longer be grabbed.
+ */
+static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
+					 void *data, void **ret_data)
+{
+	struct dlm_exit_domain *begin_exit;
+	struct dlm_ctxt *dlm = data;
+	unsigned int sender;
+
+	/* dlm_grab() yields NULL when the context is not on dlm_domains */
+	if (dlm_grab(dlm) == NULL)
+		return 0;
+
+	begin_exit = (struct dlm_exit_domain *) msg->buf;
+	sender = begin_exit->node_idx;
+	mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, sender);
+
+	spin_lock(&dlm->spinlock);
+	set_bit(sender, dlm->exit_domain_map);
+	spin_unlock(&dlm->spinlock);
+
+	/* drop the reference taken by dlm_grab() above */
+	dlm_put(dlm);
+
+	return 0;
+}
+
static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
{
/* Yikes, a double spinlock! I need domain_lock for the dlm
@@ -508,17 +532,17 @@ again:
static void __dlm_print_nodes(struct dlm_ctxt *dlm)
{
- int node = -1;
+ int node = -1, num = 0;
assert_spin_locked(&dlm->spinlock);
- printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name);
-
+ printk("( ");
while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
node + 1)) < O2NM_MAX_NODES) {
printk("%d ", node);
+ ++num;
}
- printk("\n");
+ printk(") %u nodes\n", num);
}
static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -528,17 +552,17 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
unsigned int node;
struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
- mlog_entry("%p %u %p", msg, len, data);
+ mlog(0, "%p %u %p", msg, len, data);
if (!dlm_grab(dlm))
return 0;
node = exit_msg->node_idx;
- printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name);
-
spin_lock(&dlm->spinlock);
clear_bit(node, dlm->domain_map);
+ clear_bit(node, dlm->exit_domain_map);
+ printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name);
__dlm_print_nodes(dlm);
/* notify anything attached to the heartbeat events */
@@ -551,27 +575,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
return 0;
}
-static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
+static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
unsigned int node)
{
int status;
struct dlm_exit_domain leave_msg;
- mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
- node, dlm->name, dlm->node_num);
+ mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
+ msg_type, node);
memset(&leave_msg, 0, sizeof(leave_msg));
leave_msg.node_idx = dlm->node_num;
- status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
- &leave_msg, sizeof(leave_msg), node,
- NULL);
-
- mlog(0, "status return %d from o2net_send_message\n", status);
+ status = o2net_send_message(msg_type, dlm->key, &leave_msg,
+ sizeof(leave_msg), node, NULL);
+ if (status < 0)
+ mlog(ML_ERROR, "Error %d sending domain exit message %u "
+ "to node %u on domain %s\n", status, msg_type, node,
+ dlm->name);
return status;
}
+/*
+ * Send DLM_BEGIN_EXIT_DOMAIN_MSG to every other node set in the domain map.
+ *
+ * Nothing is sent when the negotiated locking protocol is 1.x with x < 2.
+ * The result of each send is ignored.
+ */
+static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
+{
+	int peer;
+
+	/* Support for begin exit domain was added in 1.2 */
+	if (dlm->dlm_locking_proto.pv_major == 1 &&
+	    dlm->dlm_locking_proto.pv_minor < 2)
+		return;
+
+	/*
+	 * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
+	 * informational. Meaning if a node does not receive the message,
+	 * so be it.
+	 */
+	spin_lock(&dlm->spinlock);
+	for (peer = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0);
+	     peer < O2NM_MAX_NODES;
+	     peer = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, peer + 1)) {
+		/* never message ourselves */
+		if (peer == dlm->node_num)
+			continue;
+
+		/* the spinlock is released around the network send */
+		spin_unlock(&dlm->spinlock);
+		dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, peer);
+		spin_lock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+}
static void dlm_leave_domain(struct dlm_ctxt *dlm)
{
@@ -597,7 +650,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
clear_node = 1;
- status = dlm_send_one_domain_exit(dlm, node);
+ status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
+ node);
if (status < 0 &&
status != -ENOPROTOOPT &&
status != -ENOTCONN) {
@@ -672,6 +726,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
if (leave) {
mlog(0, "shutting down domain %s\n", dlm->name);
+ dlm_begin_exit_domain(dlm);
/* We changed dlm state, notify the thread */
dlm_kick_thread(dlm, NULL);
@@ -692,6 +747,8 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
dlm_mark_domain_leaving(dlm);
dlm_leave_domain(dlm);
+ printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name);
+ dlm_force_free_mles(dlm);
dlm_complete_dlm_shutdown(dlm);
}
dlm_put(dlm);
@@ -749,7 +806,7 @@ static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet,
union dlm_query_join_response response;
response.packet = *packet;
- *wire = cpu_to_be32(response.intval);
+ *wire = be32_to_cpu(response.intval);
}
static void dlm_query_join_wire_to_packet(u32 wire,
@@ -817,7 +874,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
}
/* Once the dlm ctxt is marked as leaving then we don't want
- * to be put in someone's domain map.
+ * to be put in someone's domain map.
* Also, explicitly disallow joining at certain troublesome
* times (ie. during recovery). */
if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
@@ -902,10 +959,19 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
* domain. Set him in the map and clean up our
* leftover join state. */
BUG_ON(dlm->joining_node != assert->node_idx);
+
+ if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+ mlog(0, "dlm recovery is ongoing, disallow join\n");
+ spin_unlock(&dlm->spinlock);
+ spin_unlock(&dlm_domain_lock);
+ return -EAGAIN;
+ }
+
set_bit(assert->node_idx, dlm->domain_map);
+ clear_bit(assert->node_idx, dlm->exit_domain_map);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n",
+ printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ",
assert->node_idx, dlm->name);
__dlm_print_nodes(dlm);
@@ -919,6 +985,371 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
return 0;
}
+/*
+ * Compare the heartbeat regions reported by a joining node with the
+ * regions registered on the local node.
+ *
+ * @dlm:      local domain context; only its node number is read, for logging
+ * @qr:       region query received from the joining node
+ * @local:    caller-provided scratch buffer that o2hb_get_all_regions()
+ *            fills with the local region names
+ * @locallen: size of @local in bytes
+ *
+ * Returns 0 when neither side reports global heartbeat regions, or when
+ * both sides report exactly the same set of regions. Returns -EINVAL on
+ * any mismatch or when the region count in @qr does not fit the message.
+ */
+static int dlm_match_regions(struct dlm_ctxt *dlm,
+			     struct dlm_query_region *qr,
+			     char *local, int locallen)
+{
+	char *remote = qr->qr_regions;
+	char *l, *r;
+	int localnr, i, j, foundit;
+	int status = 0;
+
+	if (!o2hb_global_heartbeat_active()) {
+		if (qr->qr_numregions) {
+			mlog(ML_ERROR, "Domain %s: Joining node %d has global "
+			     "heartbeat enabled but local node %d does not\n",
+			     qr->qr_domain, qr->qr_node, dlm->node_num);
+			status = -EINVAL;
+		}
+		goto bail;
+	}
+
+	if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
+		mlog(ML_ERROR, "Domain %s: Local node %d has global "
+		     "heartbeat enabled but joining node %d does not\n",
+		     qr->qr_domain, dlm->node_num, qr->qr_node);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	/*
+	 * The count comes off the wire; refuse anything that would walk
+	 * past the end of the region array carried in the message.
+	 */
+	if (qr->qr_numregions >
+	    sizeof(qr->qr_regions) / O2HB_MAX_REGION_NAME_LEN) {
+		mlog(ML_ERROR, "Domain %s: Joining node %d sent an invalid "
+		     "region count %d\n", qr->qr_domain, qr->qr_node,
+		     qr->qr_numregions);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	r = remote;
+	for (i = 0; i < qr->qr_numregions; ++i) {
+		mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
+		r += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+	/* never ask for more regions than the scratch buffer can hold */
+	localnr = min(O2NM_MAX_REGIONS, locallen/O2HB_MAX_REGION_NAME_LEN);
+	localnr = o2hb_get_all_regions(local, (u8)localnr);
+
+	/* compare local regions with remote */
+	l = local;
+	for (i = 0; i < localnr; ++i) {
+		foundit = 0;
+		r = remote;
+		/*
+		 * Strictly less-than: only qr_numregions entries are valid,
+		 * so the entry at index qr_numregions must not be compared.
+		 */
+		for (j = 0; j < qr->qr_numregions; ++j) {
+			if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
+				foundit = 1;
+				break;
+			}
+			r += O2HB_MAX_REGION_NAME_LEN;
+		}
+		if (!foundit) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+			     "in local node %d but not in joining node %d\n",
+			     qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
+			     dlm->node_num, qr->qr_node);
+			goto bail;
+		}
+		l += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+	/* compare remote with local regions */
+	r = remote;
+	for (i = 0; i < qr->qr_numregions; ++i) {
+		foundit = 0;
+		l = local;
+		for (j = 0; j < localnr; ++j) {
+			if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
+				foundit = 1;
+				break;
+			}
+			l += O2HB_MAX_REGION_NAME_LEN;
+		}
+		if (!foundit) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+			     "in joining node %d but not in local node %d\n",
+			     qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
+			     qr->qr_node, dlm->node_num);
+			goto bail;
+		}
+		r += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+bail:
+	return status;
+}
+
+/*
+ * Send this node's heartbeat regions to every other node set in @node_map.
+ *
+ * Returns 0 when @node_map is empty or every queried node answered with
+ * status 0, -ENOMEM when the message cannot be allocated, otherwise the
+ * first non-zero send error or remote status (sending stops there).
+ */
+static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+	struct dlm_query_region *qr;
+	int status, rc, ret = 0, n;
+	char *name;
+
+	/* nothing to do if no bit is set in the map */
+	if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+		return 0;
+
+	qr = kzalloc(sizeof(*qr), GFP_KERNEL);
+	if (!qr) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	qr->qr_node = dlm->node_num;
+	qr->qr_namelen = strlen(dlm->name);
+	memcpy(qr->qr_domain, dlm->name, qr->qr_namelen);
+	/* if local hb, the numregions will be zero */
+	if (o2hb_global_heartbeat_active())
+		qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions,
+							 O2NM_MAX_REGIONS);
+
+	name = qr->qr_regions;
+	for (n = 0; n < qr->qr_numregions; ++n) {
+		mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, name);
+		name += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+	for (n = find_next_bit(node_map, O2NM_MAX_NODES, 0);
+	     n < O2NM_MAX_NODES;
+	     n = find_next_bit(node_map, O2NM_MAX_NODES, n + 1)) {
+		if (n == dlm->node_num)
+			continue;
+
+		mlog(0, "Sending regions to node %d\n", n);
+
+		rc = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr,
+					sizeof(*qr), n, &status);
+		/* a transport error wins; otherwise use the remote status */
+		ret = (rc < 0) ? rc : status;
+		if (!ret)
+			continue;
+
+		mlog(ML_ERROR, "Region mismatch %d, node %d\n",
+		     ret, n);
+		break;
+	}
+
+	kfree(qr);
+	return ret;
+}
+
+/*
+ * o2net handler for DLM_QUERY_REGION.
+ *
+ * Looks up the domain named in the query and, if the sender is the node
+ * currently joining it and the active protocol is newer than 1.0, compares
+ * the sender's heartbeat regions with the local ones.
+ *
+ * Returns the result of dlm_match_regions(), -ENOMEM if the scratch buffer
+ * cannot be allocated, or -EINVAL if any of the preconditions fails.
+ */
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+				    void *data, void **ret_data)
+{
+	struct dlm_query_region *qr = (struct dlm_query_region *) msg->buf;
+	struct dlm_ctxt *dlm;
+	char *scratch;
+	int status = -EINVAL;
+
+	mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
+	     qr->qr_domain);
+
+	/* buffer used in dlm_match_regions() */
+	scratch = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
+	if (!scratch)
+		return -ENOMEM;
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
+	if (dlm == NULL) {
+		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+		     "before join domain\n", qr->qr_node, qr->qr_domain);
+	} else {
+		spin_lock(&dlm->spinlock);
+		if (dlm->joining_node != qr->qr_node) {
+			mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+			     "but joining node is %d\n", qr->qr_node,
+			     qr->qr_domain, dlm->joining_node);
+		} else if (dlm->dlm_locking_proto.pv_major == 1 &&
+			   dlm->dlm_locking_proto.pv_minor == 0) {
+			/* Support for global heartbeat was added in 1.1 */
+			mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+			     "but active dlm protocol is %d.%d\n", qr->qr_node,
+			     qr->qr_domain, dlm->dlm_locking_proto.pv_major,
+			     dlm->dlm_locking_proto.pv_minor);
+		} else {
+			status = dlm_match_regions(dlm, qr, scratch,
+						   sizeof(qr->qr_regions));
+		}
+		spin_unlock(&dlm->spinlock);
+	}
+	spin_unlock(&dlm_domain_lock);
+
+	kfree(scratch);
+
+	return status;
+}
+
+/*
+ * Compare the node list sent by a joining node with the local node
+ * configuration.
+ *
+ * For every node number below O2NM_MAX_NODES the node must be known to
+ * both sides or to neither, and when known to both the node number, IPv4
+ * port and IPv4 address must be equal.
+ *
+ * Returns 0 when everything matches, -EINVAL at the first mismatch.
+ */
+static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
+{
+	struct o2nm_node *local;
+	struct dlm_node_info *remote;
+	int num, k;
+	int status = 0;
+
+	for (k = 0; k < qn->qn_numnodes; ++k)
+		mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[k].ni_nodenum,
+		     &(qn->qn_nodes[k].ni_ipv4_address),
+		     ntohs(qn->qn_nodes[k].ni_ipv4_port));
+
+	for (num = 0; num < O2NM_MAX_NODES; ++num) {
+		local = o2nm_get_node_by_num(num);
+
+		/* first entry in the query carrying this node number, if any */
+		remote = NULL;
+		for (k = 0; k < qn->qn_numnodes && !remote; ++k)
+			if (qn->qn_nodes[k].ni_nodenum == num)
+				remote = &(qn->qn_nodes[k]);
+
+		/* unknown on both sides: nothing to compare */
+		if (!local && !remote)
+			continue;
+
+		if (!local) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+			     "registered in joining node %d but not in "
+			     "local node %d\n", qn->qn_domain,
+			     remote->ni_nodenum,
+			     &(remote->ni_ipv4_address),
+			     ntohs(remote->ni_ipv4_port),
+			     qn->qn_nodenum, dlm->node_num);
+		} else if (!remote) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+			     "registered in local node %d but not in "
+			     "joining node %d\n", qn->qn_domain,
+			     local->nd_num, &(local->nd_ipv4_address),
+			     ntohs(local->nd_ipv4_port),
+			     dlm->node_num, qn->qn_nodenum);
+		} else if ((remote->ni_nodenum != local->nd_num) ||
+			   (remote->ni_ipv4_port != local->nd_ipv4_port) ||
+			   (remote->ni_ipv4_address != local->nd_ipv4_address)) {
+			/* known to both sides but configured differently */
+			status = -EINVAL;
+		}
+
+		if (local)
+			o2nm_node_put(local);
+
+		if (status)
+			break;
+	}
+
+	return status;
+}
+
+/*
+ * Send the local node configuration to every other node set in @node_map.
+ *
+ * Returns 0 when @node_map is empty or every queried node answered with
+ * status 0, -ENOMEM when the message cannot be allocated, otherwise the
+ * first non-zero send error or remote status (sending stops there).
+ */
+static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+	struct dlm_query_nodeinfo *qn;
+	struct dlm_node_info *ni;
+	struct o2nm_node *node;
+	int ret = 0, rc, status, count, n;
+
+	/* nothing to do if no bit is set in the map */
+	if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+		return 0;
+
+	qn = kzalloc(sizeof(*qn), GFP_KERNEL);
+	if (!qn) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* copy every node o2nm_get_node_by_num() returns into the message */
+	count = 0;
+	for (n = 0; n < O2NM_MAX_NODES; ++n) {
+		node = o2nm_get_node_by_num(n);
+		if (node == NULL)
+			continue;
+
+		ni = &qn->qn_nodes[count++];
+		ni->ni_nodenum = node->nd_num;
+		ni->ni_ipv4_port = node->nd_ipv4_port;
+		ni->ni_ipv4_address = node->nd_ipv4_address;
+		mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
+		     &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
+		o2nm_node_put(node);
+	}
+
+	qn->qn_nodenum = dlm->node_num;
+	qn->qn_numnodes = count;
+	qn->qn_namelen = strlen(dlm->name);
+	memcpy(qn->qn_domain, dlm->name, qn->qn_namelen);
+
+	for (n = find_next_bit(node_map, O2NM_MAX_NODES, 0);
+	     n < O2NM_MAX_NODES;
+	     n = find_next_bit(node_map, O2NM_MAX_NODES, n + 1)) {
+		if (n == dlm->node_num)
+			continue;
+
+		mlog(0, "Sending nodeinfo to node %d\n", n);
+
+		rc = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+					qn, sizeof(*qn), n, &status);
+		/* a transport error wins; otherwise use the remote status */
+		ret = (rc < 0) ? rc : status;
+		if (!ret)
+			continue;
+
+		mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, n);
+		break;
+	}
+
+	kfree(qn);
+	return ret;
+}
+
+/*
+ * o2net handler for DLM_QUERY_NODEINFO.
+ *
+ * Looks up the domain named in the query and, if the sender is the node
+ * currently joining it and the active protocol is newer than 1.0, compares
+ * the sender's node list with the local configuration.
+ *
+ * Returns the result of dlm_match_nodes(), or -EINVAL if any of the
+ * preconditions fails.
+ */
+static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
+				      void *data, void **ret_data)
+{
+	struct dlm_query_nodeinfo *qn = (struct dlm_query_nodeinfo *) msg->buf;
+	struct dlm_ctxt *dlm;
+	int status = -EINVAL;
+
+	mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
+	     qn->qn_domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
+	if (dlm == NULL) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
+		     "join domain\n", qn->qn_nodenum, qn->qn_domain);
+		goto unlock_domains;
+	}
+
+	spin_lock(&dlm->spinlock);
+	if (dlm->joining_node != qn->qn_nodenum) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
+		     "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
+		     dlm->joining_node);
+		goto unlock_dlm;
+	}
+
+	/* Support for node query was added in 1.1 */
+	if (dlm->dlm_locking_proto.pv_major == 1 &&
+	    dlm->dlm_locking_proto.pv_minor == 0) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s "
+		     "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
+		     qn->qn_domain, dlm->dlm_locking_proto.pv_major,
+		     dlm->dlm_locking_proto.pv_minor);
+		goto unlock_dlm;
+	}
+
+	status = dlm_match_nodes(dlm, qn);
+
+unlock_dlm:
+	spin_unlock(&dlm->spinlock);
+unlock_domains:
+	spin_unlock(&dlm_domain_lock);
+
+	return status;
+}
+
static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
@@ -963,7 +1394,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
&cancel_msg, sizeof(cancel_msg), node,
NULL);
if (status < 0) {
- mlog_errno(status);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+ node);
goto bail;
}
@@ -1030,10 +1463,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
- sizeof(join_msg), node,
- &join_resp);
+ sizeof(join_msg), node, &join_resp);
if (status < 0 && status != -ENOPROTOOPT) {
- mlog_errno(status);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
+ node);
goto bail;
}
dlm_query_join_wire_to_packet(join_resp, &packet);
@@ -1091,6 +1525,7 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
unsigned int node)
{
int status;
+ int ret;
struct dlm_assert_joined assert_msg;
mlog(0, "Sending join assert to node %u\n", node);
@@ -1102,9 +1537,13 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
&assert_msg, sizeof(assert_msg), node,
- NULL);
+ &ret);
if (status < 0)
- mlog_errno(status);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+ node);
+ else
+ status = ret;
return status;
}
@@ -1178,7 +1617,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
struct domain_join_ctxt *ctxt;
enum dlm_query_join_response_code response = JOIN_DISALLOW;
- mlog_entry("%p", dlm);
+ mlog(0, "%p", dlm);
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
if (!ctxt) {
@@ -1234,6 +1673,21 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
set_bit(dlm->node_num, dlm->domain_map);
spin_unlock(&dlm->spinlock);
+ /* Support for global heartbeat and node info was added in 1.1 */
+ if (dlm->dlm_locking_proto.pv_major > 1 ||
+ dlm->dlm_locking_proto.pv_minor > 0) {
+ status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ status = dlm_send_regions(dlm, ctxt->yes_resp_map);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
/* Joined state *must* be set before the joining node
@@ -1248,8 +1702,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
bail:
spin_lock(&dlm->spinlock);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- if (!status)
+ if (!status) {
+ printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name);
__dlm_print_nodes(dlm);
+ }
spin_unlock(&dlm->spinlock);
if (ctxt) {
@@ -1270,8 +1726,8 @@ bail:
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
{
- o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
- o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
+ o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up);
+ o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);
o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
}
@@ -1283,13 +1739,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
- status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
+ status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
if (status)
goto bail;
o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
- status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
+ status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
if (status)
goto bail;
@@ -1399,6 +1855,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
if (status)
goto bail;
+ status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
+ sizeof(struct dlm_exit_domain),
+ dlm_begin_exit_domain_handler,
+ dlm, NULL, &dlm->dlm_domain_handlers);
+ if (status)
+ goto bail;
+
bail:
if (status)
dlm_unregister_domain_handlers(dlm);
@@ -1422,19 +1885,19 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
goto bail;
}
- status = dlm_debug_init(dlm);
+ status = dlm_launch_thread(dlm);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = dlm_launch_thread(dlm);
+ status = dlm_launch_recovery_thread(dlm);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = dlm_launch_recovery_thread(dlm);
+ status = dlm_debug_init(dlm);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1517,7 +1980,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
goto leave;
}
- dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+ dlm->name = kstrdup(domain, GFP_KERNEL);
if (dlm->name == NULL) {
mlog_errno(-ENOMEM);
kfree(dlm);
@@ -1551,7 +2014,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
for (i = 0; i < DLM_HASH_BUCKETS; i++)
INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
- strcpy(dlm->name, domain);
dlm->key = key;
dlm->node_num = o2nm_this_node();
@@ -1572,7 +2034,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
INIT_LIST_HEAD(&dlm->list);
INIT_LIST_HEAD(&dlm->dirty_list);
INIT_LIST_HEAD(&dlm->reco.resources);
- INIT_LIST_HEAD(&dlm->reco.received);
INIT_LIST_HEAD(&dlm->reco.node_data);
INIT_LIST_HEAD(&dlm->purge_list);
INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
@@ -1666,19 +2127,12 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
struct dlm_ctxt *dlm = NULL;
struct dlm_ctxt *new_ctxt = NULL;
- if (strlen(domain) > O2NM_MAX_NAME_LEN) {
+ if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
ret = -ENAMETOOLONG;
mlog(ML_ERROR, "domain name length too long\n");
goto leave;
}
- if (!o2hb_check_local_node_heartbeating()) {
- mlog(ML_ERROR, "the local node has not been configured, or is "
- "not heartbeating\n");
- ret = -EPROTO;
- goto leave;
- }
-
mlog(0, "register called for domain \"%s\"\n", domain);
retry:
@@ -1704,6 +2158,7 @@ retry:
}
if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
+ spin_unlock(&dlm_domain_lock);
mlog(ML_ERROR,
"Requested locking protocol version is not "
"compatible with already registered domain "
@@ -1800,7 +2255,21 @@ static int dlm_register_net_handlers(void)
sizeof(struct dlm_cancel_join),
dlm_cancel_join_handler,
NULL, NULL, &dlm_join_handlers);
+ if (status)
+ goto bail;
+ status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
+ sizeof(struct dlm_query_region),
+ dlm_query_region_handler,
+ NULL, NULL, &dlm_join_handlers);
+
+ if (status)
+ goto bail;
+
+ status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+ sizeof(struct dlm_query_nodeinfo),
+ dlm_query_nodeinfo_handler,
+ NULL, NULL, &dlm_join_handlers);
bail:
if (status < 0)
dlm_unregister_net_handlers();
@@ -1824,13 +2293,10 @@ static DECLARE_RWSEM(dlm_callback_sem);
void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
int node_num)
{
- struct list_head *iter;
struct dlm_eviction_cb *cb;
down_read(&dlm_callback_sem);
- list_for_each(iter, &dlm->dlm_eviction_callbacks) {
- cb = list_entry(iter, struct dlm_eviction_cb, ec_item);
-
+ list_for_each_entry(cb, &dlm->dlm_eviction_callbacks, ec_item) {
cb->ec_func(node_num, cb->ec_data);
}
up_read(&dlm_callback_sem);
@@ -1867,8 +2333,6 @@ static int __init dlm_init(void)
{
int status;
- dlm_print_version();
-
status = dlm_init_mle_cache();
if (status) {
mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
@@ -1918,6 +2382,7 @@ static void __exit dlm_exit (void)
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 Distributed Lock Management");
module_init(dlm_init);
module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
deleted file mode 100644
index a733b3321f8..00000000000
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmfsver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
-
-void dlmfs_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlm/dlmfsver.h
deleted file mode 100644
index f35eadbed25..00000000000
--- a/fs/ocfs2/dlm/dlmfsver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLMFS_VER_H
-#define DLMFS_VER_H
-
-void dlmfs_print_version(void);
-
-#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 83a9f2972ac..66c2a491f68 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -53,7 +52,7 @@
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
-static struct kmem_cache *dlm_lock_cache = NULL;
+static struct kmem_cache *dlm_lock_cache;
static DEFINE_SPINLOCK(dlm_cookie_lock);
static u64 dlm_next_cookie = 1;
@@ -92,21 +91,19 @@ void dlm_destroy_lock_cache(void)
static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
- struct list_head *iter;
struct dlm_lock *tmplock;
- list_for_each(iter, &res->granted) {
- tmplock = list_entry(iter, struct dlm_lock, list);
-
+ list_for_each_entry(tmplock, &res->granted, list) {
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
return 0;
}
- list_for_each(iter, &res->converting) {
- tmplock = list_entry(iter, struct dlm_lock, list);
-
+ list_for_each_entry(tmplock, &res->converting, list) {
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
return 0;
+ if (!dlm_lock_compatible(tmplock->ml.convert_type,
+ lock->ml.type))
+ return 0;
}
return 1;
@@ -126,7 +123,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
int call_ast = 0, kick_thread = 0;
enum dlm_status status = DLM_NORMAL;
- mlog_entry("type=%d\n", lock->ml.type);
+ mlog(0, "type=%d\n", lock->ml.type);
spin_lock(&res->spinlock);
/* if called from dlm_create_lock_handler, need to
@@ -176,15 +173,12 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
lock->ml.node);
}
} else {
+ status = DLM_NORMAL;
dlm_lock_get(lock);
list_add_tail(&lock->list, &res->blocked);
kick_thread = 1;
}
}
- /* reduce the inflight count, this may result in the lockres
- * being purged below during calc_usage */
- if (lock->ml.node == dlm->node_num)
- dlm_lockres_drop_inflight_ref(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
@@ -225,14 +219,20 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
enum dlm_status status = DLM_DENIED;
int lockres_changed = 1;
- mlog_entry("type=%d\n", lock->ml.type);
- mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
+ mlog(0, "type=%d, lockres %.*s, flags = 0x%x\n",
+ lock->ml.type, res->lockname.len,
res->lockname.name, flags);
+ /*
+ * Wait if resource is getting recovered, remastered, etc.
+ * If the resource was remastered and new owner is self, then exit.
+ */
spin_lock(&res->spinlock);
-
- /* will exit this call with spinlock held */
__dlm_wait_on_lockres(res);
+ if (res->owner == dlm->node_num) {
+ spin_unlock(&res->spinlock);
+ return DLM_RECOVERING;
+ }
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* add lock to local (secondary) queue */
@@ -270,7 +270,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
}
dlm_revert_pending_lock(res, lock);
dlm_lock_put(lock);
- } else if (dlm_is_recovery_lock(res->lockname.name,
+ } else if (dlm_is_recovery_lock(res->lockname.name,
res->lockname.len)) {
/* special case for the $RECOVERY lock.
* there will never be an AST delivered to put
@@ -306,8 +306,6 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
int tmpret, status = 0;
enum dlm_status ret;
- mlog_entry_void();
-
memset(&create, 0, sizeof(create));
create.node_idx = dlm->node_num;
create.requested_type = lock->ml.type;
@@ -319,25 +317,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
sizeof(create), res->owner, &status);
if (tmpret >= 0) {
- // successfully sent and received
- ret = status; // this is already a dlm_status
+ ret = status;
if (ret == DLM_REJECTED) {
- mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres "
- "no longer owned by %u. that node is coming back "
- "up currently.\n", dlm->name, create.namelen,
+ mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer "
+ "owned by node %u. That node is coming back up "
+ "currently.\n", dlm->name, create.namelen,
create.name, res->owner);
dlm_print_one_lock_resource(res);
BUG();
}
} else {
- mlog_errno(tmpret);
- if (dlm_is_host_down(tmpret)) {
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to "
+ "node %u\n", dlm->name, create.namelen, create.name,
+ tmpret, res->owner);
+ if (dlm_is_host_down(tmpret))
ret = DLM_RECOVERING;
- mlog(0, "node %u died so returning DLM_RECOVERING "
- "from lock message!\n", res->owner);
- } else {
+ else
ret = dlm_err_to_dlm_status(tmpret);
- }
}
return ret;
@@ -430,7 +426,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
struct dlm_lock *lock;
int kernel_allocated = 0;
- lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
+ lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
if (!lock)
return NULL;
@@ -438,7 +434,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
/* zero memory only if kernel-allocated */
lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
if (!lksb) {
- kfree(lock);
+ kmem_cache_free(dlm_lock_cache, lock);
return NULL;
}
kernel_allocated = 1;
@@ -473,8 +469,6 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
BUG_ON(!dlm);
- mlog_entry_void();
-
if (!dlm_grab(dlm))
return DLM_REJECTED;
@@ -718,18 +712,10 @@ retry_lock:
if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
status == DLM_FORWARD) {
- mlog(0, "retrying lock with migration/"
- "recovery/in progress\n");
msleep(100);
- /* no waiting for dlm_reco_thread */
if (recovery) {
if (status != DLM_RECOVERING)
goto retry_lock;
-
- mlog(0, "%s: got RECOVERING "
- "for $RECOVERY lock, master "
- "was %u\n", dlm->name,
- res->owner);
/* wait to see the node go down, then
* drop down and allow the lockres to
* get cleaned up. need to remaster. */
@@ -741,6 +727,14 @@ retry_lock:
}
}
+ /* Inflight taken in dlm_get_lock_resource() is dropped here */
+ spin_lock(&res->spinlock);
+ dlm_lockres_drop_inflight_ref(dlm, res);
+ spin_unlock(&res->spinlock);
+
+ dlm_lockres_calc_usage(dlm, res);
+ dlm_kick_thread(dlm, res);
+
if (status != DLM_NORMAL) {
lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
if (status != DLM_NOTQUEUED)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f8b653fcd4d..82abf0cc9a1 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -83,9 +82,9 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
return 1;
}
-static struct kmem_cache *dlm_lockres_cache = NULL;
-static struct kmem_cache *dlm_lockname_cache = NULL;
-static struct kmem_cache *dlm_mle_cache = NULL;
+static struct kmem_cache *dlm_lockres_cache;
+static struct kmem_cache *dlm_lockname_cache;
+static struct kmem_cache *dlm_mle_cache;
static void dlm_mle_release(struct kref *kref);
static void dlm_init_mle(struct dlm_master_list_entry *mle,
@@ -343,16 +342,13 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
{
struct dlm_master_list_entry *tmpmle;
struct hlist_head *bucket;
- struct hlist_node *list;
unsigned int hash;
assert_spin_locked(&dlm->master_lock);
hash = dlm_lockid_hash(name, namelen);
bucket = dlm_master_hash(dlm, hash);
- hlist_for_each(list, bucket) {
- tmpmle = hlist_entry(list, struct dlm_master_list_entry,
- master_hash_node);
+ hlist_for_each_entry(tmpmle, bucket, master_hash_node) {
if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
continue;
dlm_get_mle(tmpmle);
@@ -367,7 +363,7 @@ void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
struct dlm_master_list_entry *mle;
assert_spin_locked(&dlm->spinlock);
-
+
list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
if (node_up)
dlm_mle_node_up(dlm, mle, NULL, idx);
@@ -427,8 +423,6 @@ static void dlm_mle_release(struct kref *kref)
struct dlm_master_list_entry *mle;
struct dlm_ctxt *dlm;
- mlog_entry_void();
-
mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
dlm = mle->dlm;
@@ -478,11 +472,15 @@ bail:
void dlm_destroy_master_caches(void)
{
- if (dlm_lockname_cache)
+ if (dlm_lockname_cache) {
kmem_cache_destroy(dlm_lockname_cache);
+ dlm_lockname_cache = NULL;
+ }
- if (dlm_lockres_cache)
+ if (dlm_lockres_cache) {
kmem_cache_destroy(dlm_lockres_cache);
+ dlm_lockres_cache = NULL;
+ }
}
static void dlm_lockres_release(struct kref *kref)
@@ -512,8 +510,6 @@ static void dlm_lockres_release(struct kref *kref)
atomic_dec(&dlm->res_cur_count);
- dlm_put(dlm);
-
if (!hlist_unhashed(&res->hash_node) ||
!list_empty(&res->granted) ||
!list_empty(&res->converting) ||
@@ -585,9 +581,8 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
atomic_set(&res->asts_reserved, 0);
res->migration_pending = 0;
res->inflight_locks = 0;
+ res->inflight_assert_workers = 0;
- /* put in dlm_lockres_release */
- dlm_grab(dlm);
res->dlm = dlm;
kref_init(&res->refs);
@@ -618,13 +613,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
{
struct dlm_lock_resource *res = NULL;
- res = (struct dlm_lock_resource *)
- kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
+ res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
if (!res)
goto error;
- res->lockname.name = (char *)
- kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
+ res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
if (!res->lockname.name)
goto error;
@@ -640,42 +633,94 @@ error:
return NULL;
}
-void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- int new_lockres,
- const char *file,
- int line)
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit)
+{
+ assert_spin_locked(&res->spinlock);
+
+ mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
+ res->lockname.name, bit, __builtin_return_address(0));
+
+ set_bit(bit, res->refmap);
+}
+
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit)
{
- if (!new_lockres)
- assert_spin_locked(&res->spinlock);
+ assert_spin_locked(&res->spinlock);
+
+ mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
+ res->lockname.name, bit, __builtin_return_address(0));
+
+ clear_bit(bit, res->refmap);
+}
+
+
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
- if (!test_bit(dlm->node_num, res->refmap)) {
- BUG_ON(res->inflight_locks != 0);
- dlm_lockres_set_refmap_bit(dlm->node_num, res);
- }
res->inflight_locks++;
- mlog(0, "%s:%.*s: inflight++: now %u\n",
- dlm->name, res->lockname.len, res->lockname.name,
- res->inflight_locks);
+
+ mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
+ res->lockname.len, res->lockname.name, res->inflight_locks,
+ __builtin_return_address(0));
}
-void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
{
assert_spin_locked(&res->spinlock);
BUG_ON(res->inflight_locks == 0);
+
res->inflight_locks--;
- mlog(0, "%s:%.*s: inflight--: now %u\n",
- dlm->name, res->lockname.len, res->lockname.name,
- res->inflight_locks);
- if (res->inflight_locks == 0)
- dlm_lockres_clear_refmap_bit(dlm->node_num, res);
+
+ mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
+ res->lockname.len, res->lockname.name, res->inflight_locks,
+ __builtin_return_address(0));
+
wake_up(&res->wq);
}
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ res->inflight_assert_workers++;
+ mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->inflight_assert_workers);
+}
+
+static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ spin_lock(&res->spinlock);
+ __dlm_lockres_grab_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
+}
+
+static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ BUG_ON(res->inflight_assert_workers == 0);
+ res->inflight_assert_workers--;
+ mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->inflight_assert_workers);
+}
+
+static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ spin_lock(&res->spinlock);
+ __dlm_lockres_drop_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
+}
+
/*
* lookup a lock resource by name.
* may already exist in the hashtable.
@@ -706,7 +751,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
unsigned int hash;
int tries = 0;
int bit, wait_on_recovery = 0;
- int drop_inflight_if_nonlocal = 0;
BUG_ON(!lockid);
@@ -718,36 +762,33 @@ lookup:
spin_lock(&dlm->spinlock);
tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
if (tmpres) {
- int dropping_ref = 0;
-
spin_unlock(&dlm->spinlock);
-
spin_lock(&tmpres->spinlock);
- /* We wait for the other thread that is mastering the resource */
+ /* Wait on the thread that is mastering the resource */
if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
__dlm_wait_on_lockres(tmpres);
BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+ spin_unlock(&tmpres->spinlock);
+ dlm_lockres_put(tmpres);
+ tmpres = NULL;
+ goto lookup;
}
- if (tmpres->owner == dlm->node_num) {
- BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
- dlm_lockres_grab_inflight_ref(dlm, tmpres);
- } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
- dropping_ref = 1;
- spin_unlock(&tmpres->spinlock);
-
- /* wait until done messaging the master, drop our ref to allow
- * the lockres to be purged, start over. */
- if (dropping_ref) {
- spin_lock(&tmpres->spinlock);
- __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
+ /* Wait on the resource purge to complete before continuing */
+ if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
+ BUG_ON(tmpres->owner == dlm->node_num);
+ __dlm_wait_on_lockres_flags(tmpres,
+ DLM_LOCK_RES_DROPPING_REF);
spin_unlock(&tmpres->spinlock);
dlm_lockres_put(tmpres);
tmpres = NULL;
goto lookup;
}
- mlog(0, "found in hash!\n");
+ /* Grab inflight ref to pin the resource */
+ dlm_lockres_grab_inflight_ref(dlm, tmpres);
+
+ spin_unlock(&tmpres->spinlock);
if (res)
dlm_lockres_put(res);
res = tmpres;
@@ -758,8 +799,7 @@ lookup:
spin_unlock(&dlm->spinlock);
mlog(0, "allocating a new resource\n");
/* nothing found and we need to allocate one. */
- alloc_mle = (struct dlm_master_list_entry *)
- kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+ alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!alloc_mle)
goto leave;
res = dlm_new_lockres(dlm, lockid, namelen);
@@ -818,7 +858,7 @@ lookup:
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
mle = NULL;
- /* this is lame, but we cant wait on either
+ /* this is lame, but we can't wait on either
* the mle or lockres waitqueue here */
if (mig)
msleep(100);
@@ -834,13 +874,13 @@ lookup:
__dlm_insert_mle(dlm, mle);
/* still holding the dlm spinlock, check the recovery map
- * to see if there are any nodes that still need to be
+ * to see if there are any nodes that still need to be
* considered. these will not appear in the mle nodemap
* but they might own this lockres. wait on them. */
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
- "recover before lock mastery can begin\n",
+ mlog(0, "%s: res %.*s, At least one node (%d) "
+ "to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
}
@@ -853,12 +893,11 @@ lookup:
/* finally add the lockres to its hash bucket */
__dlm_insert_lockres(dlm, res);
- /* since this lockres is new it doesnt not require the spinlock */
- dlm_lockres_grab_inflight_ref_new(dlm, res);
- /* if this node does not become the master make sure to drop
- * this inflight reference below */
- drop_inflight_if_nonlocal = 1;
+ /* Grab inflight ref to pin the resource */
+ spin_lock(&res->spinlock);
+ dlm_lockres_grab_inflight_ref(dlm, res);
+ spin_unlock(&res->spinlock);
/* get an extra ref on the mle in case this is a BLOCK
* if so, the creator of the BLOCK may try to put the last
@@ -874,8 +913,8 @@ redo_request:
* dlm spinlock would be detectable be a change on the mle,
* so we only need to clear out the recovery map once. */
if (dlm_is_recovery_lock(lockid, namelen)) {
- mlog(ML_NOTICE, "%s: recovery map is not empty, but "
- "must master $RECOVERY lock now\n", dlm->name);
+ mlog(0, "%s: Recovery map is not empty, but must "
+ "master $RECOVERY lock now\n", dlm->name);
if (!dlm_pre_master_reco_lockres(dlm, res))
wait_on_recovery = 0;
else {
@@ -884,7 +923,7 @@ redo_request:
msleep(500);
}
continue;
- }
+ }
dlm_kick_recovery_thread(dlm);
msleep(1000);
@@ -893,8 +932,8 @@ redo_request:
spin_lock(&dlm->spinlock);
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
- "recover before lock mastery can begin\n",
+ mlog(0, "%s: res %.*s, At least one node (%d) "
+ "to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
} else
@@ -923,8 +962,8 @@ redo_request:
* yet, keep going until it does. this is how the
* master will know that asserts are needed back to
* the lower nodes. */
- mlog(0, "%s:%.*s: requests only up to %u but master "
- "is %u, keep going\n", dlm->name, namelen,
+ mlog(0, "%s: res %.*s, Requests only up to %u but "
+ "master is %u, keep going\n", dlm->name, namelen,
lockid, nodenum, mle->master);
}
}
@@ -934,14 +973,13 @@ wait:
ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
if (ret < 0) {
wait_on_recovery = 1;
- mlog(0, "%s:%.*s: node map changed, redo the "
- "master request now, blocked=%d\n",
- dlm->name, res->lockname.len,
+ mlog(0, "%s: res %.*s, Node map changed, redo the master "
+ "request now, blocked=%d\n", dlm->name, res->lockname.len,
res->lockname.name, blocked);
if (++tries > 20) {
- mlog(ML_ERROR, "%s:%.*s: spinning on "
- "dlm_wait_for_lock_mastery, blocked=%d\n",
- dlm->name, res->lockname.len,
+ mlog(ML_ERROR, "%s: res %.*s, Spinning on "
+ "dlm_wait_for_lock_mastery, blocked = %d\n",
+ dlm->name, res->lockname.len,
res->lockname.name, blocked);
dlm_print_one_lock_resource(res);
dlm_print_one_mle(mle);
@@ -950,7 +988,8 @@ wait:
goto redo_request;
}
- mlog(0, "lockres mastered by %u\n", res->owner);
+ mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
+ res->lockname.name, res->owner);
/* make sure we never continue without this */
BUG_ON(res->owner == O2NM_MAX_NODES);
@@ -962,8 +1001,6 @@ wait:
wake_waiters:
spin_lock(&res->spinlock);
- if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
- dlm_lockres_drop_inflight_ref(dlm, res);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
@@ -1030,7 +1067,7 @@ recheck:
ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
b = (mle->type == DLM_MLE_BLOCK);
if ((*blocked && !b) || (!*blocked && b)) {
- mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
+ mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
dlm->name, res->lockname.len, res->lockname.name,
*blocked, b);
*blocked = b;
@@ -1436,9 +1473,7 @@ way_up_top:
}
if (res->owner == dlm->node_num) {
- mlog(0, "%s:%.*s: setting bit %u in refmap\n",
- dlm->name, namelen, name, request->node_idx);
- dlm_lockres_set_refmap_bit(request->node_idx, res);
+ dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
spin_unlock(&res->spinlock);
response = DLM_MASTER_RESP_YES;
if (mle)
@@ -1503,10 +1538,8 @@ way_up_top:
* go back and clean the mles on any
* other nodes */
dispatch_assert = 1;
- dlm_lockres_set_refmap_bit(request->node_idx, res);
- mlog(0, "%s:%.*s: setting bit %u in refmap\n",
- dlm->name, namelen, name,
- request->node_idx);
+ dlm_lockres_set_refmap_bit(dlm, res,
+ request->node_idx);
} else
response = DLM_MASTER_RESP_NO;
} else {
@@ -1543,8 +1576,7 @@ way_up_top:
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
- mle = (struct dlm_master_list_entry *)
- kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+ mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
response = DLM_MASTER_RESP_ERROR;
mlog_errno(-ENOMEM);
@@ -1603,13 +1635,14 @@ send_response:
}
mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
dlm->node_num, res->lockname.len, res->lockname.name);
- ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
+ ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
DLM_ASSERT_MASTER_MLE_CLEANUP);
if (ret < 0) {
mlog(ML_ERROR, "failed to dispatch assert master work\n");
response = DLM_MASTER_RESP_ERROR;
dlm_lockres_put(res);
- }
+ } else
+ dlm_lockres_grab_inflight_worker(dlm, res);
} else {
if (res)
dlm_lockres_put(res);
@@ -1667,7 +1700,9 @@ again:
tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
&assert, sizeof(assert), to, &r);
if (tmpret < 0) {
- mlog(0, "assert_master returned %d!\n", tmpret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", tmpret,
+ DLM_ASSERT_MASTER_MSG, dlm->key, to);
if (!dlm_is_host_down(tmpret)) {
mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
BUG();
@@ -1702,7 +1737,7 @@ again:
if (r & DLM_ASSERT_RESPONSE_REASSERT) {
mlog(0, "%.*s: node %u create mles on other "
- "nodes and requests a re-assert\n",
+ "nodes and requests a re-assert\n",
namelen, lockname, to);
reassert = 1;
}
@@ -1711,7 +1746,7 @@ again:
"lockres, set the bit in the refmap\n",
namelen, lockname, to);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(to, res);
+ dlm_lockres_set_refmap_bit(dlm, res, to);
spin_unlock(&res->spinlock);
}
}
@@ -1813,7 +1848,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
goto done;
- }
+ }
}
}
spin_unlock(&dlm->master_lock);
@@ -1876,7 +1911,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
ok:
spin_unlock(&res->spinlock);
}
- spin_unlock(&dlm->spinlock);
// mlog(0, "woo! got an assert_master from node %u!\n",
// assert->node_idx);
@@ -1884,7 +1918,7 @@ ok:
int extra_ref = 0;
int nn = -1;
int rr, err = 0;
-
+
spin_lock(&mle->spinlock);
if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
extra_ref = 1;
@@ -1892,10 +1926,12 @@ ok:
/* MASTER mle: if any bits set in the response map
* then the calling node needs to re-assert to clear
* up nodes that this node contacted */
- while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
+ while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
nn+1)) < O2NM_MAX_NODES) {
- if (nn != dlm->node_num && nn != assert->node_idx)
+ if (nn != dlm->node_num && nn != assert->node_idx) {
master_request = 1;
+ break;
+ }
}
}
mle->master = assert->node_idx;
@@ -1927,7 +1963,6 @@ ok:
/* master is known, detach if not already detached.
* ensures that only one assert_master call will happen
* on this mle. */
- spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
rr = atomic_read(&mle->mle_refs.refcount);
@@ -1960,7 +1995,6 @@ ok:
__dlm_put_mle(mle);
}
spin_unlock(&dlm->master_lock);
- spin_unlock(&dlm->spinlock);
} else if (res) {
if (res->owner != assert->node_idx) {
mlog(0, "assert_master from %u, but current "
@@ -1968,6 +2002,7 @@ ok:
res->owner, namelen, name);
}
}
+ spin_unlock(&dlm->spinlock);
done:
ret = 0;
@@ -2003,7 +2038,7 @@ kill:
__dlm_print_one_lock_resource(res);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
- *ret_data = (void *)res;
+ *ret_data = (void *)res;
dlm_put(dlm);
return -EINVAL;
}
@@ -2027,7 +2062,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
int ignore_higher, u8 request_from, u32 flags)
{
struct dlm_work_item *item;
- item = kzalloc(sizeof(*item), GFP_NOFS);
+ item = kzalloc(sizeof(*item), GFP_ATOMIC);
if (!item)
return -ENOMEM;
@@ -2041,10 +2076,10 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
item->u.am.request_from = request_from;
item->u.am.flags = flags;
- if (ignore_higher)
- mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
+ if (ignore_higher)
+ mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
res->lockname.name);
-
+
spin_lock(&dlm->work_lock);
list_add_tail(&item->list, &dlm->work_list);
spin_unlock(&dlm->work_lock);
@@ -2122,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
dlm_lockres_release_ast(dlm, res);
put:
+ dlm_lockres_drop_inflight_worker(dlm, res);
+
dlm_lockres_put(res);
mlog(0, "finished with dlm_assert_master_worker\n");
@@ -2134,7 +2171,7 @@ put:
* think that $RECOVERY is currently mastered by a dead node. If so,
* we wait a short time to allow that node to get notified by its own
* heartbeat stack, then check again. All $RECOVERY lock resources
- * mastered by dead nodes are purged when the hearbeat callback is
+ * mastered by dead nodes are purged when the hearbeat callback is
* fired, so we can know for sure that it is safe to continue once
* the node returns a live node or no node. */
static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
@@ -2175,7 +2212,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
ret = -EAGAIN;
}
spin_unlock(&dlm->spinlock);
- mlog(0, "%s: reco lock master is %u\n", dlm->name,
+ mlog(0, "%s: reco lock master is %u\n", dlm->name,
master);
break;
}
@@ -2198,8 +2235,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
namelen = res->lockname.len;
BUG_ON(namelen > O2NM_MAX_NAME_LEN);
- mlog(0, "%s:%.*s: sending deref to %d\n",
- dlm->name, namelen, lockname, res->owner);
memset(&deref, 0, sizeof(deref));
deref.node_idx = dlm->node_num;
deref.namelen = namelen;
@@ -2208,12 +2243,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
&deref, sizeof(deref), res->owner, &r);
if (ret < 0)
- mlog_errno(ret);
+ mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
+ dlm->name, namelen, lockname, ret, res->owner);
else if (r < 0) {
/* BAD. other node says I did not have a ref. */
- mlog(ML_ERROR,"while dropping ref on %s:%.*s "
- "(master=%u) got %d.\n", dlm->name, namelen,
- lockname, res->owner, r);
+ mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+ dlm->name, namelen, lockname, res->owner, r);
dlm_print_one_lock_resource(res);
BUG();
}
@@ -2269,7 +2304,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
else {
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
if (test_bit(node, res->refmap)) {
- dlm_lockres_clear_refmap_bit(node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
}
@@ -2329,7 +2364,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
if (test_bit(node, res->refmap)) {
__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
- dlm_lockres_clear_refmap_bit(node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
spin_unlock(&res->spinlock);
@@ -2348,55 +2383,59 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
dlm_lockres_put(res);
}
-/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0
- * if not. If 0, numlocks is set to the number of locks in the lockres.
+/*
+ * A migrateable resource is one that is :
+ * 1. locally mastered, and,
+ * 2. zero local locks, and,
+ * 3. one or more non-local locks, or, one or more references
+ * Returns 1 if yes, 0 if not.
*/
static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- int *numlocks)
+ struct dlm_lock_resource *res)
{
- int ret;
- int i;
- int count = 0;
+ enum dlm_lockres_list idx;
+ int nonlocal = 0, node_ref;
struct list_head *queue;
struct dlm_lock *lock;
+ u64 cookie;
assert_spin_locked(&res->spinlock);
- ret = -EINVAL;
- if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
- mlog(0, "cannot migrate lockres with unknown owner!\n");
- goto leave;
- }
+ /* delay migration when the lockres is in MIGRATING state */
+ if (res->state & DLM_LOCK_RES_MIGRATING)
+ return 0;
- if (res->owner != dlm->node_num) {
- mlog(0, "cannot migrate lockres this node doesn't own!\n");
- goto leave;
- }
+ if (res->owner != dlm->node_num)
+ return 0;
- ret = 0;
- queue = &res->granted;
- for (i = 0; i < 3; i++) {
+ for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+ queue = dlm_list_idx_to_ptr(res, idx);
list_for_each_entry(lock, queue, list) {
- ++count;
- if (lock->ml.node == dlm->node_num) {
- mlog(0, "found a lock owned by this node still "
- "on the %s queue! will not migrate this "
- "lockres\n", (i == 0 ? "granted" :
- (i == 1 ? "converting" :
- "blocked")));
- ret = -ENOTEMPTY;
- goto leave;
+ if (lock->ml.node != dlm->node_num) {
+ nonlocal++;
+ continue;
}
+ cookie = be64_to_cpu(lock->ml.cookie);
+ mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
+ "%s list\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(cookie),
+ dlm_get_lock_cookie_seq(cookie),
+ dlm_list_in_text(idx));
+ return 0;
}
- queue++;
}
- *numlocks = count;
- mlog(0, "migrateable lockres having %d locks\n", *numlocks);
+ if (!nonlocal) {
+ node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ if (node_ref >= O2NM_MAX_NODES)
+ return 0;
+ }
-leave:
- return ret;
+ mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
+ res->lockname.name);
+
+ return 1;
}
/*
@@ -2405,8 +2444,7 @@ leave:
static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- u8 target)
+ struct dlm_lock_resource *res, u8 target)
{
struct dlm_master_list_entry *mle = NULL;
struct dlm_master_list_entry *oldmle = NULL;
@@ -2415,39 +2453,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
const char *name;
unsigned int namelen;
int mle_added = 0;
- int numlocks;
int wake = 0;
if (!dlm_grab(dlm))
return -EINVAL;
+ BUG_ON(target == O2NM_MAX_NODES);
+
name = res->lockname.name;
namelen = res->lockname.len;
- mlog(0, "migrating %.*s to %u\n", namelen, name, target);
-
- /*
- * ensure this lockres is a proper candidate for migration
- */
- spin_lock(&res->spinlock);
- ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
- if (ret < 0) {
- spin_unlock(&res->spinlock);
- goto leave;
- }
- spin_unlock(&res->spinlock);
-
- /* no work to do */
- if (numlocks == 0) {
- mlog(0, "no locks were found on this lockres! done!\n");
- goto leave;
- }
-
- /*
- * preallocate up front
- * if this fails, abort
- */
+ mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
+ target);
+ /* preallocate up front. if this fails, abort */
ret = -ENOMEM;
mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
if (!mres) {
@@ -2455,8 +2474,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
goto leave;
}
- mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
- GFP_NOFS);
+ mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
mlog_errno(ret);
goto leave;
@@ -2464,35 +2482,10 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
ret = 0;
/*
- * find a node to migrate the lockres to
- */
-
- mlog(0, "picking a migration node\n");
- spin_lock(&dlm->spinlock);
- /* pick a new node */
- if (!test_bit(target, dlm->domain_map) ||
- target >= O2NM_MAX_NODES) {
- target = dlm_pick_migration_target(dlm, res);
- }
- mlog(0, "node %u chosen for migration\n", target);
-
- if (target >= O2NM_MAX_NODES ||
- !test_bit(target, dlm->domain_map)) {
- /* target chosen is not alive */
- ret = -EINVAL;
- }
-
- if (ret) {
- spin_unlock(&dlm->spinlock);
- goto fail;
- }
-
- mlog(0, "continuing with target = %u\n", target);
-
- /*
* clear any existing master requests and
* add the migration mle to the list
*/
+ spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
@@ -2533,6 +2526,7 @@ fail:
dlm_put_mle(mle);
} else if (mle) {
kmem_cache_free(dlm_mle_cache, mle);
+ mle = NULL;
}
goto leave;
}
@@ -2576,6 +2570,9 @@ fail:
res->state &= ~DLM_LOCK_RES_MIGRATING;
wake = 1;
spin_unlock(&res->spinlock);
+ if (dlm_is_host_down(ret))
+ dlm_wait_for_node_death(dlm, target,
+ DLM_NODE_DEATH_WAIT_MAX);
goto leave;
}
@@ -2587,7 +2584,7 @@ fail:
* is complete everywhere. if the target dies while this is
* going on, some nodes could potentially see the target as the
* master, so it is important that my recovery finds the migration
- * mle and sets the master to UNKNONWN. */
+ * mle and sets the master to UNKNOWN. */
/* wait for new node to assert master */
@@ -2603,7 +2600,7 @@ fail:
mlog(0, "%s:%.*s: timed out during migration\n",
dlm->name, res->lockname.len, res->lockname.name);
- /* avoid hang during shutdown when migrating lockres
+ /* avoid hang during shutdown when migrating lockres
* to a node which also goes down */
if (dlm_is_node_dead(dlm, target)) {
mlog(0, "%s:%.*s: expected migration "
@@ -2651,69 +2648,52 @@ leave:
if (wake)
wake_up(&res->wq);
- /* TODO: cleanup */
if (mres)
free_page((unsigned long)mres);
dlm_put(dlm);
- mlog(0, "returning %d\n", ret);
+ mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
+ name, target, ret);
return ret;
}
#define DLM_MIGRATION_RETRY_MS 100
-/* Should be called only after beginning the domain leave process.
+/*
+ * Should be called only after beginning the domain leave process.
* There should not be any remaining locks on nonlocal lock resources,
* and there should be no local locks left on locally mastered resources.
*
* Called with the dlm spinlock held, may drop it to do migration, but
* will re-acquire before exit.
*
- * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */
+ * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
+ */
int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
int ret;
int lock_dropped = 0;
- int numlocks;
+ u8 target = O2NM_MAX_NODES;
+
+ assert_spin_locked(&dlm->spinlock);
spin_lock(&res->spinlock);
- if (res->owner != dlm->node_num) {
- if (!__dlm_lockres_unused(res)) {
- mlog(ML_ERROR, "%s:%.*s: this node is not master, "
- "trying to free this but locks remain\n",
- dlm->name, res->lockname.len, res->lockname.name);
- }
- spin_unlock(&res->spinlock);
- goto leave;
- }
+ if (dlm_is_lockres_migrateable(dlm, res))
+ target = dlm_pick_migration_target(dlm, res);
+ spin_unlock(&res->spinlock);
- /* No need to migrate a lockres having no locks */
- ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
- if (ret >= 0 && numlocks == 0) {
- spin_unlock(&res->spinlock);
+ if (target == O2NM_MAX_NODES)
goto leave;
- }
- spin_unlock(&res->spinlock);
/* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
spin_unlock(&dlm->spinlock);
lock_dropped = 1;
- while (1) {
- ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES);
- if (ret >= 0)
- break;
- if (ret == -ENOTEMPTY) {
- mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
- res->lockname.len, res->lockname.name);
- BUG();
- }
-
- mlog(0, "lockres %.*s: migrate failed, "
- "retrying\n", res->lockname.len,
- res->lockname.name);
- msleep(DLM_MIGRATION_RETRY_MS);
- }
+ ret = dlm_migrate_lockres(dlm, res, target);
+ if (ret)
+ mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ target, ret);
spin_lock(&dlm->spinlock);
leave:
return lock_dropped;
@@ -2739,7 +2719,7 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
spin_unlock(&res->spinlock);
- /* target has died, so make the caller break out of the
+ /* target has died, so make the caller break out of the
* wait_event, but caller must recheck the domain_map */
spin_lock(&dlm->spinlock);
if (!test_bit(mig_target, dlm->domain_map))
@@ -2812,14 +2792,8 @@ again:
mlog(0, "trying again...\n");
goto again;
}
- /* now that we are sure the MIGRATING state is there, drop
- * the unneded state which blocked threads trying to DIRTY */
- spin_lock(&res->spinlock);
- BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
- BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
- res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
- spin_unlock(&res->spinlock);
+ ret = 0;
/* did the target go down or die? */
spin_lock(&dlm->spinlock);
if (!test_bit(target, dlm->domain_map)) {
@@ -2830,9 +2804,21 @@ again:
spin_unlock(&dlm->spinlock);
/*
+ * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
+ * another try; otherwise, we are sure the MIGRATING state is there,
+ * drop the unneeded state which blocked threads trying to DIRTY
+ */
+ spin_lock(&res->spinlock);
+ BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
+ res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
+ if (!ret)
+ BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+ spin_unlock(&res->spinlock);
+
+ /*
* at this point:
*
- * o the DLM_LOCK_RES_MIGRATING flag is set
+ * o the DLM_LOCK_RES_MIGRATING flag is set if target not down
* o there are no pending asts on this lockres
* o all processes trying to reserve an ast on this
* lockres must wait for the MIGRATING flag to clear
@@ -2864,7 +2850,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
BUG_ON(!list_empty(&lock->bast_list));
BUG_ON(lock->ast_pending);
BUG_ON(lock->bast_pending);
- dlm_lockres_clear_refmap_bit(lock->ml.node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res,
+ lock->ml.node);
list_del_init(&lock->list);
dlm_lock_put(lock);
/* In a normal unlock, we would have added a
@@ -2885,61 +2872,61 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
mlog(0, "%s:%.*s: node %u had a ref to this "
"migrating lockres, clearing\n", dlm->name,
res->lockname.len, res->lockname.name, bit);
- dlm_lockres_clear_refmap_bit(bit, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, bit);
}
bit++;
}
}
-/* for now this is not too intelligent. we will
- * need stats to make this do the right thing.
- * this just finds the first lock on one of the
- * queues and uses that node as the target. */
+/*
+ * Pick a node to migrate the lock resource to. This function selects a
+ * potential target based first on the locks and then on refmap. It skips
+ * nodes that are in the process of exiting the domain.
+ */
static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- int i;
+ enum dlm_lockres_list idx;
struct list_head *queue = &res->granted;
struct dlm_lock *lock;
- int nodenum;
+ int noderef;
+ u8 nodenum = O2NM_MAX_NODES;
assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
- spin_lock(&res->spinlock);
- for (i=0; i<3; i++) {
+ /* Go through all the locks */
+ for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+ queue = dlm_list_idx_to_ptr(res, idx);
list_for_each_entry(lock, queue, list) {
- /* up to the caller to make sure this node
- * is alive */
- if (lock->ml.node != dlm->node_num) {
- spin_unlock(&res->spinlock);
- return lock->ml.node;
- }
+ if (lock->ml.node == dlm->node_num)
+ continue;
+ if (test_bit(lock->ml.node, dlm->exit_domain_map))
+ continue;
+ nodenum = lock->ml.node;
+ goto bail;
}
- queue++;
}
- spin_unlock(&res->spinlock);
- mlog(0, "have not found a suitable target yet! checking domain map\n");
- /* ok now we're getting desperate. pick anyone alive. */
- nodenum = -1;
+ /* Go through the refmap */
+ noderef = -1;
while (1) {
- nodenum = find_next_bit(dlm->domain_map,
- O2NM_MAX_NODES, nodenum+1);
- mlog(0, "found %d in domain map\n", nodenum);
- if (nodenum >= O2NM_MAX_NODES)
+ noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
+ noderef + 1);
+ if (noderef >= O2NM_MAX_NODES)
break;
- if (nodenum != dlm->node_num) {
- mlog(0, "picking %d\n", nodenum);
- return nodenum;
- }
+ if (noderef == dlm->node_num)
+ continue;
+ if (test_bit(noderef, dlm->exit_domain_map))
+ continue;
+ nodenum = noderef;
+ goto bail;
}
- mlog(0, "giving up. no master to migrate to\n");
- return DLM_LOCK_RES_OWNER_UNKNOWN;
+bail:
+ return nodenum;
}
-
-
/* this is called by the new master once all lockres
* data has been received */
static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
@@ -2978,7 +2965,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
&migrate, sizeof(migrate), nodenum,
&status);
if (ret < 0) {
- mlog(0, "migrate_request returned %d!\n", ret);
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send "
+ "MIGRATE_REQUEST to node %u\n", dlm->name,
+ migrate.namelen, migrate.name, ret, nodenum);
if (!dlm_is_host_down(ret)) {
mlog(ML_ERROR, "unhandled error=%d!\n", ret);
BUG();
@@ -2997,7 +2986,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
dlm->name, res->lockname.len, res->lockname.name,
nodenum);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(nodenum, res);
+ dlm_lockres_set_refmap_bit(dlm, res, nodenum);
spin_unlock(&res->spinlock);
}
}
@@ -3036,8 +3025,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
hash = dlm_lockid_hash(name, namelen);
/* preallocate.. if this fails, abort */
- mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
- GFP_NOFS);
+ mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
ret = -ENOMEM;
@@ -3047,8 +3035,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
/* check for pre-existing lock */
spin_lock(&dlm->spinlock);
res = __dlm_lookup_lockres(dlm, name, namelen, hash);
- spin_lock(&dlm->master_lock);
-
if (res) {
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -3066,14 +3052,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&res->spinlock);
}
+ spin_lock(&dlm->master_lock);
/* ignore status. only nonzero status would BUG. */
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
name, namelen,
migrate->new_master,
migrate->master);
-unlock:
spin_unlock(&dlm->master_lock);
+unlock:
spin_unlock(&dlm->spinlock);
if (oldmle) {
@@ -3108,8 +3095,6 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
*oldmle = NULL;
- mlog_entry_void();
-
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&dlm->master_lock);
@@ -3144,11 +3129,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
/* remove it so that only one mle will be found */
__dlm_unlink_mle(dlm, tmp);
__dlm_mle_detach_hb_events(dlm, tmp);
- ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
- mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
- "telling master to get ref for cleared out mle "
- "during migration\n", dlm->name, namelen, name,
- master, new_master);
+ if (tmp->type == DLM_MLE_MASTER) {
+ ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
+ mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
+ "telling master to get ref "
+ "for cleared out mle during "
+ "migration\n", dlm->name,
+ namelen, name, master,
+ new_master);
+ }
}
spin_unlock(&tmp->spinlock);
}
@@ -3246,10 +3235,10 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
struct dlm_master_list_entry *mle;
struct dlm_lock_resource *res;
struct hlist_head *bucket;
- struct hlist_node *list;
+ struct hlist_node *tmp;
unsigned int i;
- mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+ mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
top:
assert_spin_locked(&dlm->spinlock);
@@ -3257,10 +3246,7 @@ top:
spin_lock(&dlm->master_lock);
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_master_hash(dlm, i);
- hlist_for_each(list, bucket) {
- mle = hlist_entry(list, struct dlm_master_list_entry,
- master_hash_node);
-
+ hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
BUG_ON(mle->type != DLM_MLE_BLOCK &&
mle->type != DLM_MLE_MASTER &&
mle->type != DLM_MLE_MIGRATION);
@@ -3335,7 +3321,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
* mastery reference here since old_master will briefly have
* a reference after the migration completes */
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(old_master, res);
+ dlm_lockres_set_refmap_bit(dlm, res, old_master);
spin_unlock(&res->spinlock);
mlog(0, "now time to do a migrate request to other nodes\n");
@@ -3435,3 +3421,41 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
wake_up(&res->wq);
wake_up(&dlm->migration_wq);
}
+
+void dlm_force_free_mles(struct dlm_ctxt *dlm)
+{
+ int i;
+ struct hlist_head *bucket;
+ struct dlm_master_list_entry *mle;
+ struct hlist_node *tmp;
+
+ /*
+ * We notified all other nodes that we are exiting the domain and
+ * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
+ * around we force free them and wake any processes that are waiting
+ * on the mles
+ */
+ spin_lock(&dlm->spinlock);
+ spin_lock(&dlm->master_lock);
+
+ BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
+ BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
+
+ for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+ bucket = dlm_master_hash(dlm, i);
+ hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
+ if (mle->type != DLM_MLE_BLOCK) {
+ mlog(ML_ERROR, "bad mle: %p\n", mle);
+ dlm_print_one_mle(mle);
+ }
+ atomic_set(&mle->woken, 1);
+ wake_up(&mle->wq);
+
+ __dlm_unlink_mle(dlm, mle);
+ __dlm_mle_detach_hb_events(dlm, mle);
+ __dlm_put_mle(mle);
+ }
+ }
+ spin_unlock(&dlm->master_lock);
+ spin_unlock(&dlm->spinlock);
+}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index bcb9260c373..45067faf569 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -30,7 +30,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -56,9 +55,6 @@
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_recovery_thread(void *data);
-void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
-int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
-void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
static int dlm_do_recovery(struct dlm_ctxt *dlm);
static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
@@ -311,7 +307,7 @@ static int dlm_recovery_thread(void *data)
mlog(0, "dlm thread running for %s...\n", dlm->name);
while (!kthread_should_stop()) {
- if (dlm_joined(dlm)) {
+ if (dlm_domain_fully_joined(dlm)) {
status = dlm_do_recovery(dlm);
if (status == -EAGAIN) {
/* do not sleep, recheck immediately. */
@@ -363,40 +359,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
}
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
{
- if (timeout) {
- mlog(ML_NOTICE, "%s: waiting %dms for notification of "
- "death of node %u\n", dlm->name, timeout, node);
+ if (dlm_is_node_dead(dlm, node))
+ return;
+
+ printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
+ "domain %s\n", node, dlm->name);
+
+ if (timeout)
wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_dead(dlm, node),
- msecs_to_jiffies(timeout));
- } else {
- mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
- "of death of node %u\n", dlm->name, node);
+ dlm_is_node_dead(dlm, node),
+ msecs_to_jiffies(timeout));
+ else
wait_event(dlm->dlm_reco_thread_wq,
dlm_is_node_dead(dlm, node));
- }
- /* for now, return 0 */
- return 0;
}
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
{
- if (timeout) {
- mlog(0, "%s: waiting %dms for notification of "
- "recovery of node %u\n", dlm->name, timeout, node);
+ if (dlm_is_node_recovered(dlm, node))
+ return;
+
+ printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
+ "domain %s\n", node, dlm->name);
+
+ if (timeout)
wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_recovered(dlm, node),
- msecs_to_jiffies(timeout));
- } else {
- mlog(0, "%s: waiting indefinitely for notification "
- "of recovery of node %u\n", dlm->name, node);
+ dlm_is_node_recovered(dlm, node),
+ msecs_to_jiffies(timeout));
+ else
wait_event(dlm->dlm_reco_thread_wq,
dlm_is_node_recovered(dlm, node));
- }
- /* for now, return 0 */
- return 0;
}
/* callers of the top-level api calls (dlmlock/dlmunlock) should
@@ -431,6 +425,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm)
{
spin_lock(&dlm->spinlock);
BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+ printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
+ dlm->name, dlm->reco.dead_node);
dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
spin_unlock(&dlm->spinlock);
}
@@ -441,9 +437,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
spin_unlock(&dlm->spinlock);
+ printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
wake_up(&dlm->reco.event);
}
+static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
+{
+ printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
+ "dead node %u in domain %s\n", dlm->reco.new_master,
+ (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
+ dlm->reco.dead_node, dlm->name);
+}
+
static int dlm_do_recovery(struct dlm_ctxt *dlm)
{
int status = 0;
@@ -464,7 +469,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
int bit;
- bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
+ bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit >= O2NM_MAX_NODES || bit < 0)
dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
else
@@ -506,9 +511,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
}
mlog(0, "another node will master this recovery session.\n");
}
- mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
- dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master,
- dlm->node_num, dlm->reco.dead_node);
+
+ dlm_print_recovery_master(dlm);
/* it is safe to start everything back up here
* because all of the dead node's lock resources
@@ -519,15 +523,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
return 0;
master_here:
- mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node "
- "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task),
- dlm->node_num, dlm->reco.dead_node, dlm->name);
+ dlm_print_recovery_master(dlm);
status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
if (status < 0) {
/* we should never hit this anymore */
- mlog(ML_ERROR, "error %d remastering locks for node %u, "
- "retrying.\n", status, dlm->reco.dead_node);
+ mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
+ "retrying.\n", dlm->name, status, dlm->reco.dead_node);
/* yield a bit to allow any final network messages
* to get handled on remaining nodes */
msleep(100);
@@ -535,7 +537,10 @@ master_here:
/* success! see if any other nodes need recovery */
mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
dlm->name, dlm->reco.dead_node, dlm->node_num);
- dlm_reset_recovery(dlm);
+ spin_lock(&dlm->spinlock);
+ __dlm_reset_recovery(dlm);
+ dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
}
dlm_end_recovery(dlm);
@@ -568,7 +573,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
- mlog(0, "requesting lock info from node %u\n",
+ mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
ndata->node_num);
if (ndata->node_num == dlm->node_num) {
@@ -641,7 +646,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
spin_unlock(&dlm_reco_state_lock);
}
- mlog(0, "done requesting all lock info\n");
+ mlog(0, "%s: Done requesting all lock info\n", dlm->name);
/* nodes should be sending reco data now
* just need to wait */
@@ -693,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
if (all_nodes_done) {
int ret;
+ /* Set this flag on the recovery master so that a
+ * new recovery for another dead node cannot start
+ * before the current recovery is done. Otherwise
+ * recovery may hang. */
+ spin_lock(&dlm->spinlock);
+ dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
+
/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
* just send a finalize message to everyone and
* clean up */
@@ -728,7 +741,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
if (destroy)
dlm_destroy_recovery_area(dlm, dead_node);
- mlog_exit(status);
return status;
}
@@ -785,7 +797,8 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
u8 dead_node)
{
struct dlm_lock_request lr;
- enum dlm_status ret;
+ int ret;
+ int status;
mlog(0, "\n");
@@ -798,14 +811,16 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
lr.dead_node = dead_node;
// send message
- ret = DLM_NOLOCKMGR;
ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
- &lr, sizeof(lr), request_from, NULL);
+ &lr, sizeof(lr), request_from, &status);
/* negative status is handled by caller */
if (ret < 0)
- mlog_errno(ret);
-
+ mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
+ "to recover dead node %u\n", dlm->name, ret,
+ request_from, dead_node);
+ else
+ ret = status;
// return from here, then
// sleep until all received or error
return ret;
@@ -956,10 +971,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
sizeof(done_msg), send_to, &tmpret);
if (ret < 0) {
+ mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
+ "to recover dead node %u\n", dlm->name, ret, send_to,
+ dead_node);
if (!dlm_is_host_down(ret)) {
- mlog_errno(ret);
- mlog(ML_ERROR, "%s: unknown error sending data-done "
- "to %u\n", dlm->name, send_to);
BUG();
}
} else
@@ -1051,7 +1066,7 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
if (lock->ml.node == dead_node) {
mlog(0, "AHA! there was "
"a $RECOVERY lock for dead "
- "node %u (%s)!\n",
+ "node %u (%s)!\n",
dead_node, dlm->name);
list_del_init(&lock->list);
dlm_lock_put(lock);
@@ -1118,7 +1133,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
dlm->name, res->lockname.len, res->lockname.name,
- orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery",
+ orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery",
send_to);
/* send it */
@@ -1127,7 +1142,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
if (ret < 0) {
/* XXX: negative status is not handled.
* this will end up killing this node. */
- mlog_errno(ret);
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
+ "node %u (%s)\n", dlm->name, mres->lockname_len,
+ mres->lockname, ret, send_to,
+ (orig_flags & DLM_MRES_MIGRATION ?
+ "migration" : "recovery"));
} else {
/* might get an -ENOMEM back here */
ret = status;
@@ -1165,6 +1184,39 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
mres->master = master;
}
+static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock,
+ struct dlm_migratable_lockres *mres,
+ int queue)
+{
+ if (!lock->lksb)
+ return;
+
+ /* Ignore lvb in all locks in the blocked list */
+ if (queue == DLM_BLOCKED_LIST)
+ return;
+
+ /* Only consider lvbs in locks with granted EX or PR lock levels */
+ if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE)
+ return;
+
+ if (dlm_lvb_is_empty(mres->lvb)) {
+ memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+ return;
+ }
+
+ /* Ensure the lvb copied for migration matches in other valid locks */
+ if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))
+ return;
+
+ mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, "
+ "node=%u\n",
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->lockres->lockname.len, lock->lockres->lockname.name,
+ lock->ml.node);
+ dlm_print_one_lock_resource(lock->lockres);
+ BUG();
+}
/* returns 1 if this lock fills the network structure,
* 0 otherwise */
@@ -1182,20 +1234,7 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
ml->list = queue;
if (lock->lksb) {
ml->flags = lock->lksb->flags;
- /* send our current lvb */
- if (ml->type == LKM_EXMODE ||
- ml->type == LKM_PRMODE) {
- /* if it is already set, this had better be a PR
- * and it has to match */
- if (!dlm_lvb_is_empty(mres->lvb) &&
- (ml->type == LKM_EXMODE ||
- memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
- mlog(ML_ERROR, "mismatched lvbs!\n");
- dlm_print_one_lock_resource(lock->lockres);
- BUG();
- }
- memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
- }
+ dlm_prepare_lvb_for_migration(lock, mres, queue);
}
ml->node = lock->ml.node;
mres->num_locks++;
@@ -1379,6 +1418,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
mres->lockname_len, mres->lockname);
ret = -EFAULT;
spin_unlock(&res->spinlock);
+ dlm_lockres_put(res);
goto leave;
}
res->state |= DLM_LOCK_RES_MIGRATING;
@@ -1469,13 +1509,11 @@ leave:
dlm_put(dlm);
if (ret < 0) {
- if (buf)
- kfree(buf);
- if (item)
- kfree(item);
+ kfree(buf);
+ kfree(item);
+ mlog_errno(ret);
}
- mlog_exit(ret);
return ret;
}
@@ -1544,7 +1582,6 @@ leave:
dlm_lockres_put(res);
}
kfree(data);
- mlog_exit(ret);
}
@@ -1623,7 +1660,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
&req, sizeof(req), nodenum, &status);
/* XXX: negative status not handled properly here. */
if (ret < 0)
- mlog_errno(ret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
+ dlm->key, nodenum);
else {
BUG_ON(status < 0);
BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -1669,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
mlog_errno(-ENOMEM);
/* retry!? */
BUG();
- }
+ } else
+ __dlm_lockres_grab_inflight_worker(dlm, res);
} else /* put.. incase we are not the master */
dlm_lockres_put(res);
spin_unlock(&res->spinlock);
@@ -1722,15 +1762,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
struct dlm_migratable_lockres *mres)
{
struct dlm_migratable_lock *ml;
- struct list_head *queue;
+ struct list_head *queue, *iter;
struct list_head *tmpq = NULL;
struct dlm_lock *newlock = NULL;
struct dlm_lockstatus *lksb = NULL;
int ret = 0;
int i, j, bad;
- struct dlm_lock *lock = NULL;
+ struct dlm_lock *lock;
u8 from = O2NM_MAX_NODES;
unsigned int added = 0;
+ __be64 c;
mlog(0, "running %d locks for this lockres\n", mres->num_locks);
for (i=0; i<mres->num_locks; i++) {
@@ -1743,7 +1784,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
dlm->name, mres->lockname_len, mres->lockname,
from);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(from, res);
+ dlm_lockres_set_refmap_bit(dlm, res, from);
spin_unlock(&res->spinlock);
added++;
break;
@@ -1762,14 +1803,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
/* MIGRATION ONLY! */
BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
+ lock = NULL;
spin_lock(&res->spinlock);
for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
tmpq = dlm_list_idx_to_ptr(res, j);
- list_for_each_entry(lock, tmpq, list) {
- if (lock->ml.cookie != ml->cookie)
- lock = NULL;
- else
+ list_for_each(iter, tmpq) {
+ lock = list_entry(iter,
+ struct dlm_lock, list);
+ if (lock->ml.cookie == ml->cookie)
break;
+ lock = NULL;
}
if (lock)
break;
@@ -1778,19 +1821,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
/* lock is always created locally first, and
* destroyed locally last. it must be on the list */
if (!lock) {
- __be64 c = ml->cookie;
- mlog(ML_ERROR, "could not find local lock "
- "with cookie %u:%llu!\n",
+ c = ml->cookie;
+ mlog(ML_ERROR, "Could not find local lock "
+ "with cookie %u:%llu, node %u, "
+ "list %u, flags 0x%x, type %d, "
+ "conv %d, highest blocked %d\n",
dlm_get_lock_cookie_node(be64_to_cpu(c)),
- dlm_get_lock_cookie_seq(be64_to_cpu(c)));
+ dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+ ml->node, ml->list, ml->flags, ml->type,
+ ml->convert_type, ml->highest_blocked);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
+
+ if (lock->ml.node != ml->node) {
+ c = lock->ml.cookie;
+ mlog(ML_ERROR, "Mismatched node# in lock "
+ "cookie %u:%llu, name %.*s, node %u\n",
+ dlm_get_lock_cookie_node(be64_to_cpu(c)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+ res->lockname.len, res->lockname.name,
+ lock->ml.node);
+ c = ml->cookie;
+ mlog(ML_ERROR, "Migrate lock cookie %u:%llu, "
+ "node %u, list %u, flags 0x%x, type %d, "
+ "conv %d, highest blocked %d\n",
+ dlm_get_lock_cookie_node(be64_to_cpu(c)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+ ml->node, ml->list, ml->flags, ml->type,
+ ml->convert_type, ml->highest_blocked);
__dlm_print_one_lock_resource(res);
BUG();
}
- BUG_ON(lock->ml.node != ml->node);
if (tmpq != queue) {
- mlog(0, "lock was on %u instead of %u for %.*s\n",
- j, ml->list, res->lockname.len, res->lockname.name);
+ c = ml->cookie;
+ mlog(0, "Lock cookie %u:%llu was on list %u "
+ "instead of list %u for %.*s\n",
+ dlm_get_lock_cookie_node(be64_to_cpu(c)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(c)),
+ j, ml->list, res->lockname.len,
+ res->lockname.name);
+ __dlm_print_one_lock_resource(res);
spin_unlock(&res->spinlock);
continue;
}
@@ -1828,6 +1900,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
if (ml->type == LKM_NLMODE)
goto skip_lvb;
+ /*
+ * If the lock is in the blocked list it can't have a valid lvb,
+ * so skip it
+ */
+ if (ml->list == DLM_BLOCKED_LIST)
+ goto skip_lvb;
+
if (!dlm_lvb_is_empty(mres->lvb)) {
if (lksb->flags & DLM_LKSB_PUT_LVB) {
/* other node was trying to update
@@ -1840,7 +1919,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
* the lvb. */
memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
} else {
- /* otherwise, the node is sending its
+ /* otherwise, the node is sending its
* most recent valid lvb info */
BUG_ON(ml->type != LKM_EXMODE &&
ml->type != LKM_PRMODE);
@@ -1887,7 +1966,7 @@ skip_lvb:
spin_lock(&res->spinlock);
list_for_each_entry(lock, queue, list) {
if (lock->ml.cookie == ml->cookie) {
- __be64 c = lock->ml.cookie;
+ c = lock->ml.cookie;
mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
"exists on this lockres!\n", dlm->name,
res->lockname.len, res->lockname.name,
@@ -1908,11 +1987,19 @@ skip_lvb:
}
if (!bad) {
dlm_lock_get(newlock);
- list_add_tail(&newlock->list, queue);
+ if (mres->flags & DLM_MRES_RECOVERY &&
+ ml->list == DLM_CONVERTING_LIST &&
+ newlock->ml.type >
+ newlock->ml.convert_type) {
+ /* newlock is doing downconvert, add it to the
+ * head of converting list */
+ list_add(&newlock->list, queue);
+ } else
+ list_add_tail(&newlock->list, queue);
mlog(0, "%s:%.*s: added lock for node %u, "
"setting refmap bit\n", dlm->name,
res->lockname.len, res->lockname.name, ml->node);
- dlm_lockres_set_refmap_bit(ml->node, res);
+ dlm_lockres_set_refmap_bit(dlm, res, ml->node);
added++;
}
spin_unlock(&res->spinlock);
@@ -1931,7 +2018,6 @@ leave:
dlm_lock_put(newlock);
}
- mlog_exit(ret);
return ret;
}
@@ -1942,6 +2028,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
struct list_head *queue;
struct dlm_lock *lock, *next;
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
res->state |= DLM_LOCK_RES_RECOVERING;
if (!list_empty(&res->recovering)) {
mlog(0,
@@ -2022,16 +2110,16 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
u8 dead_node, u8 new_master)
{
int i;
- struct hlist_node *hash_iter;
struct hlist_head *bucket;
struct dlm_lock_resource *res, *next;
- mlog_entry_void();
-
assert_spin_locked(&dlm->spinlock);
list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
if (res->owner == dead_node) {
+ mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->owner, new_master);
list_del_init(&res->recovering);
spin_lock(&res->spinlock);
/* new_master has our reference from
@@ -2052,41 +2140,31 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
* if necessary */
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
- if (res->state & DLM_LOCK_RES_RECOVERING) {
- if (res->owner == dead_node) {
- mlog(0, "(this=%u) res %.*s owner=%u "
- "was not on recovering list, but "
- "clearing state anyway\n",
- dlm->node_num, res->lockname.len,
- res->lockname.name, new_master);
- } else if (res->owner == dlm->node_num) {
- mlog(0, "(this=%u) res %.*s owner=%u "
- "was not on recovering list, "
- "owner is THIS node, clearing\n",
- dlm->node_num, res->lockname.len,
- res->lockname.name, new_master);
- } else
- continue;
+ hlist_for_each_entry(res, bucket, hash_node) {
+ if (!(res->state & DLM_LOCK_RES_RECOVERING))
+ continue;
- if (!list_empty(&res->recovering)) {
- mlog(0, "%s:%.*s: lockres was "
- "marked RECOVERING, owner=%u\n",
- dlm->name, res->lockname.len,
- res->lockname.name, res->owner);
- list_del_init(&res->recovering);
- dlm_lockres_put(res);
- }
- spin_lock(&res->spinlock);
- /* new_master has our reference from
- * the lock state sent during recovery */
- dlm_change_lockres_owner(dlm, res, new_master);
- res->state &= ~DLM_LOCK_RES_RECOVERING;
- if (__dlm_lockres_has_locks(res))
- __dlm_dirty_lockres(dlm, res);
- spin_unlock(&res->spinlock);
- wake_up(&res->wq);
+ if (res->owner != dead_node &&
+ res->owner != dlm->node_num)
+ continue;
+
+ if (!list_empty(&res->recovering)) {
+ list_del_init(&res->recovering);
+ dlm_lockres_put(res);
}
+
+ /* new_master has our reference from
+ * the lock state sent during recovery */
+ mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->owner, new_master);
+ spin_lock(&res->spinlock);
+ dlm_change_lockres_owner(dlm, res, new_master);
+ res->state &= ~DLM_LOCK_RES_RECOVERING;
+ if (__dlm_lockres_has_locks(res))
+ __dlm_dirty_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
}
}
}
@@ -2115,7 +2193,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
assert_spin_locked(&res->spinlock);
if (res->owner == dlm->node_num)
- /* if this node owned the lockres, and if the dead node
+ /* if this node owned the lockres, and if the dead node
* had an EX when he died, blank out the lvb */
search_node = dead_node;
else {
@@ -2153,7 +2231,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
/* this node is the lockres master:
* 1) remove any stale locks for the dead node
- * 2) if the dead node had an EX when he died, blank out the lvb
+ * 2) if the dead node had an EX when he died, blank out the lvb
*/
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
@@ -2194,13 +2272,18 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
"dropping ref from lockres\n", dlm->name,
res->lockname.len, res->lockname.name, freed, dead_node);
- BUG_ON(!test_bit(dead_node, res->refmap));
- dlm_lockres_clear_refmap_bit(dead_node, res);
+ if(!test_bit(dead_node, res->refmap)) {
+ mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, "
+ "but ref was not set\n", dlm->name,
+ res->lockname.len, res->lockname.name, freed, dead_node);
+ __dlm_print_one_lock_resource(res);
+ }
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
} else if (test_bit(dead_node, res->refmap)) {
mlog(0, "%s:%.*s: dead node %u had a ref, but had "
"no locks and had not purged before dying\n", dlm->name,
res->lockname.len, res->lockname.name, dead_node);
- dlm_lockres_clear_refmap_bit(dead_node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
}
/* do not kick thread yet */
@@ -2216,7 +2299,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
{
- struct hlist_node *iter;
struct dlm_lock_resource *res;
int i;
struct hlist_head *bucket;
@@ -2242,7 +2324,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
*/
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, iter, bucket, hash_node) {
+ hlist_for_each_entry(res, bucket, hash_node) {
/* always prune any $RECOVERY entries for dead nodes,
* otherwise hangs can occur during later recovery */
if (dlm_is_recovery_lock(res->lockname.name,
@@ -2261,27 +2343,31 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
}
spin_unlock(&res->spinlock);
continue;
- }
+ }
spin_lock(&res->spinlock);
/* zero the lvb if necessary */
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
- if (res->state & DLM_LOCK_RES_DROPPING_REF)
- mlog(0, "%s:%.*s: owned by "
- "dead node %u, this node was "
- "dropping its ref when it died. "
- "continue, dropping the flag.\n",
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(ML_NOTICE, "%s: res %.*s, Skip "
+ "recovery as it is being freed\n",
dlm->name, res->lockname.len,
- res->lockname.name, dead_node);
-
- /* the wake_up for this will happen when the
- * RECOVERING flag is dropped later */
- res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ res->lockname.name);
+ } else
+ dlm_move_lockres_to_recovery_list(dlm,
+ res);
- dlm_move_lockres_to_recovery_list(dlm, res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
+ } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+ if (test_bit(dead_node, res->refmap)) {
+ mlog(0, "%s:%.*s: dead node %u had a ref, but had "
+ "no locks and had not purged before dying\n",
+ dlm->name, res->lockname.len,
+ res->lockname.name, dead_node);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
+ }
}
spin_unlock(&res->spinlock);
}
@@ -2340,6 +2426,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
mlog(0, "node %u being removed from domain map!\n", idx);
clear_bit(idx, dlm->domain_map);
+ clear_bit(idx, dlm->exit_domain_map);
/* wake up migration waiters if a node goes down.
* perhaps later we can genericize this for other waiters. */
wake_up(&dlm->migration_wq);
@@ -2412,7 +2499,7 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
* this function on each node racing to become the recovery
* master will not stop attempting this until either:
* a) this node gets the EX (and becomes the recovery master),
- * or b) dlm->reco.new_master gets set to some nodenum
+ * or b) dlm->reco.new_master gets set to some nodenum
* != O2NM_INVALID_NODE_NUM (another node will do the reco).
* so each time a recovery master is needed, the entire cluster
* will sync at this point. if the new master dies, that will
@@ -2425,7 +2512,7 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
-again:
+again:
memset(&lksb, 0, sizeof(lksb));
ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
@@ -2438,8 +2525,8 @@ again:
if (ret == DLM_NORMAL) {
mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
dlm->name, dlm->node_num);
-
- /* got the EX lock. check to see if another node
+
+ /* got the EX lock. check to see if another node
* just became the reco master */
if (dlm_reco_master_ready(dlm)) {
mlog(0, "%s: got reco EX lock, but %u will "
@@ -2452,12 +2539,12 @@ again:
/* see if recovery was already finished elsewhere */
spin_lock(&dlm->spinlock);
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
- status = -EINVAL;
+ status = -EINVAL;
mlog(0, "%s: got reco EX lock, but "
"node got recovered already\n", dlm->name);
if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
mlog(ML_ERROR, "%s: new master is %u "
- "but no dead node!\n",
+ "but no dead node!\n",
dlm->name, dlm->reco.new_master);
BUG();
}
@@ -2469,7 +2556,7 @@ again:
* set the master and send the messages to begin recovery */
if (!status) {
mlog(0, "%s: dead=%u, this=%u, sending "
- "begin_reco now\n", dlm->name,
+ "begin_reco now\n", dlm->name,
dlm->reco.dead_node, dlm->node_num);
status = dlm_send_begin_reco_message(dlm,
dlm->reco.dead_node);
@@ -2502,7 +2589,7 @@ again:
mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
dlm->name, dlm->node_num);
/* another node is master. wait on
- * reco.new_master != O2NM_INVALID_NODE_NUM
+ * reco.new_master != O2NM_INVALID_NODE_NUM
* for at most one second */
wait_event_timeout(dlm->dlm_reco_thread_wq,
dlm_reco_master_ready(dlm),
@@ -2549,8 +2636,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
int nodenum;
int status;
- mlog_entry("%u\n", dead_node);
-
mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
spin_lock(&dlm->spinlock);
@@ -2586,17 +2671,32 @@ retry:
if (dlm_is_host_down(ret)) {
/* node is down. not involved in recovery
* so just keep going */
- mlog(0, "%s: node %u was down when sending "
+ mlog(ML_NOTICE, "%s: node %u was down when sending "
"begin reco msg (%d)\n", dlm->name, nodenum, ret);
ret = 0;
}
+
+ /*
+ * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8,
+ * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN.
+ * We are handling both for compatibility reasons.
+ */
+ if (ret == -EAGAIN || ret == EAGAIN) {
+ mlog(0, "%s: trying to start recovery of node "
+ "%u, but node %u is waiting for last recovery "
+ "to complete, backoff for a bit\n", dlm->name,
+ dead_node, nodenum);
+ msleep(100);
+ goto retry;
+ }
if (ret < 0) {
struct dlm_lock_resource *res;
- /* this is now a serious problem, possibly ENOMEM
+
+ /* this is now a serious problem, possibly ENOMEM
* in the network stack. must retry */
mlog_errno(ret);
mlog(ML_ERROR, "begin reco of dlm %s to node %u "
- " returned %d\n", dlm->name, nodenum, ret);
+ "returned %d\n", dlm->name, nodenum, ret);
res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
DLM_RECOVERY_LOCK_NAME_LEN);
if (res) {
@@ -2605,18 +2705,10 @@ retry:
} else {
mlog(ML_ERROR, "recovery lock not found\n");
}
- /* sleep for a bit in hopes that we can avoid
+ /* sleep for a bit in hopes that we can avoid
* another ENOMEM */
msleep(100);
goto retry;
- } else if (ret == EAGAIN) {
- mlog(0, "%s: trying to start recovery of node "
- "%u, but node %u is waiting for last recovery "
- "to complete, backoff for a bit\n", dlm->name,
- dead_node, nodenum);
- /* TODO Look into replacing msleep with cond_resched() */
- msleep(100);
- goto retry;
}
}
@@ -2640,7 +2732,8 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
dlm->name, br->node_idx, br->dead_node,
dlm->reco.dead_node, dlm->reco.new_master);
spin_unlock(&dlm->spinlock);
- return EAGAIN;
+ dlm_put(dlm);
+ return -EAGAIN;
}
spin_unlock(&dlm->spinlock);
@@ -2665,7 +2758,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
}
if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
- "node %u changing it to %u\n", dlm->name,
+ "node %u changing it to %u\n", dlm->name,
dlm->reco.dead_node, br->node_idx, br->dead_node);
}
dlm_set_reco_master(dlm, br->node_idx);
@@ -2729,10 +2822,12 @@ stage2:
if (ret >= 0)
ret = status;
if (ret < 0) {
- mlog_errno(ret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
+ dlm->key, nodenum);
if (dlm_is_host_down(ret)) {
- /* this has no effect on this recovery
- * session, so set the status to zero to
+ /* this has no effect on this recovery
+ * session, so set the status to zero to
* finish out the last recovery */
mlog(ML_ERROR, "node %u went down after this "
"node finished recovery.\n", nodenum);
@@ -2769,7 +2864,7 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
mlog(0, "%s: node %u finalizing recovery stage%d of "
"node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
-
+
spin_lock(&dlm->spinlock);
if (dlm->reco.new_master != fr->node_idx) {
@@ -2809,8 +2904,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
BUG();
}
dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ __dlm_reset_recovery(dlm);
spin_unlock(&dlm->spinlock);
- dlm_reset_recovery(dlm);
dlm_kick_recovery_thread(dlm);
break;
default:
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d490b66ad9d..69aac6f088a 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,9 +28,7 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -94,19 +92,29 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
* truly ready to be freed. */
int __dlm_lockres_unused(struct dlm_lock_resource *res)
{
- if (!__dlm_lockres_has_locks(res) &&
- (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) {
- /* try not to scan the bitmap unless the first two
- * conditions are already true */
- int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
- if (bit >= O2NM_MAX_NODES) {
- /* since the bit for dlm->node_num is not
- * set, inflight_locks better be zero */
- BUG_ON(res->inflight_locks != 0);
- return 1;
- }
- }
- return 0;
+ int bit;
+
+ assert_spin_locked(&res->spinlock);
+
+ if (__dlm_lockres_has_locks(res))
+ return 0;
+
+ /* Locks are in the process of being created */
+ if (res->inflight_locks)
+ return 0;
+
+ if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
+ return 0;
+
+ if (res->state & DLM_LOCK_RES_RECOVERING)
+ return 0;
+
+ /* Another node has this resource with this node as the master */
+ bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ if (bit < O2NM_MAX_NODES)
+ return 0;
+
+ return 1;
}
@@ -116,15 +124,13 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
-
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
if (__dlm_lockres_unused(res)){
if (list_empty(&res->purge)) {
- mlog(0, "putting lockres %.*s:%p onto purge list\n",
- res->lockname.len, res->lockname.name, res);
+ mlog(0, "%s: Adding res %.*s to purge list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
res->last_used = jiffies;
dlm_lockres_get(res);
@@ -132,8 +138,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
dlm->purge_count++;
}
} else if (!list_empty(&res->purge)) {
- mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n",
- res->lockname.len, res->lockname.name, res, res->owner);
+ mlog(0, "%s: Removing res %.*s from purge list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
list_del_init(&res->purge);
dlm_lockres_put(res);
@@ -144,7 +150,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
@@ -154,45 +159,24 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
spin_unlock(&dlm->spinlock);
}
-static int dlm_purge_lockres(struct dlm_ctxt *dlm,
+static void dlm_purge_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
int master;
int ret = 0;
- spin_lock(&res->spinlock);
- if (!__dlm_lockres_unused(res)) {
- mlog(0, "%s:%.*s: tried to purge but not unused\n",
- dlm->name, res->lockname.len, res->lockname.name);
- __dlm_print_one_lock_resource(res);
- spin_unlock(&res->spinlock);
- BUG();
- }
-
- if (res->state & DLM_LOCK_RES_MIGRATING) {
- mlog(0, "%s:%.*s: Delay dropref as this lockres is "
- "being remastered\n", dlm->name, res->lockname.len,
- res->lockname.name);
- /* Re-add the lockres to the end of the purge list */
- if (!list_empty(&res->purge)) {
- list_del_init(&res->purge);
- list_add_tail(&res->purge, &dlm->purge_list);
- }
- spin_unlock(&res->spinlock);
- return 0;
- }
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
master = (res->owner == dlm->node_num);
- if (!master)
- res->state |= DLM_LOCK_RES_DROPPING_REF;
- spin_unlock(&res->spinlock);
-
- mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
- res->lockname.name, master);
+ mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
+ res->lockname.len, res->lockname.name, master);
if (!master) {
+ res->state |= DLM_LOCK_RES_DROPPING_REF;
/* drop spinlock... retake below */
+ spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
spin_lock(&res->spinlock);
@@ -203,34 +187,38 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
/* clear our bit from the master's refmap, ignore errors */
ret = dlm_drop_lockres_ref(dlm, res);
if (ret < 0) {
- mlog_errno(ret);
if (!dlm_is_host_down(ret))
BUG();
}
- mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
- dlm->name, res->lockname.len, res->lockname.name, ret);
spin_lock(&dlm->spinlock);
+ spin_lock(&res->spinlock);
}
if (!list_empty(&res->purge)) {
- mlog(0, "removing lockres %.*s:%p from purgelist, "
- "master = %d\n", res->lockname.len, res->lockname.name,
- res, master);
+ mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
+ dlm->name, res->lockname.len, res->lockname.name, master);
list_del_init(&res->purge);
dlm_lockres_put(res);
dlm->purge_count--;
}
- __dlm_unhash_lockres(res);
+
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
+
+ __dlm_unhash_lockres(dlm, res);
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
- spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_DROPPING_REF;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
- }
- return 0;
+ } else
+ spin_unlock(&res->spinlock);
}
static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -249,17 +237,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
lockres = list_entry(dlm->purge_list.next,
struct dlm_lock_resource, purge);
- /* Status of the lockres *might* change so double
- * check. If the lockres is unused, holding the dlm
- * spinlock will prevent people from getting and more
- * refs on it -- there's no need to keep the lockres
- * spinlock. */
spin_lock(&lockres->spinlock);
- unused = __dlm_lockres_unused(lockres);
- spin_unlock(&lockres->spinlock);
-
- if (!unused)
- continue;
purge_jiffies = lockres->last_used +
msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -271,15 +249,32 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
* in tail order, we can stop at the first
* unpurgable resource -- anyone added after
* him will have a greater last_used value */
+ spin_unlock(&lockres->spinlock);
break;
}
+ /* Status of the lockres *might* change so double
+ * check. If the lockres is unused, holding the dlm
+ * spinlock will prevent people from getting and more
+ * refs on it. */
+ unused = __dlm_lockres_unused(lockres);
+ if (!unused ||
+ (lockres->state & DLM_LOCK_RES_MIGRATING) ||
+ (lockres->inflight_assert_workers != 0)) {
+ mlog(0, "%s: res %.*s is in use or being remastered, "
+ "used %d, state %d, assert master workers %u\n",
+ dlm->name, lockres->lockname.len,
+ lockres->lockname.name,
+ !unused, lockres->state,
+ lockres->inflight_assert_workers);
+ list_move_tail(&lockres->purge, &dlm->purge_list);
+ spin_unlock(&lockres->spinlock);
+ continue;
+ }
+
dlm_lockres_get(lockres);
- /* This may drop and reacquire the dlm spinlock if it
- * has to do migration. */
- if (dlm_purge_lockres(dlm, lockres))
- BUG();
+ dlm_purge_lockres(dlm, lockres);
dlm_lockres_put(lockres);
@@ -294,19 +289,15 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
struct dlm_lock *lock, *target;
- struct list_head *iter;
- struct list_head *head;
int can_grant = 1;
- //mlog(0, "res->lockname.len=%d\n", res->lockname.len);
- //mlog(0, "res->lockname.name=%p\n", res->lockname.name);
- //mlog(0, "shuffle res %.*s\n", res->lockname.len,
- // res->lockname.name);
-
- /* because this function is called with the lockres
+ /*
+ * Because this function is called with the lockres
* spinlock, and because we know that it is not migrating/
* recovering/in-progress, it is fine to reserve asts and
- * basts right before queueing them all throughout */
+ * basts right before queueing them all throughout
+ */
+ assert_spin_locked(&dlm->ast_lock);
assert_spin_locked(&res->spinlock);
BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
DLM_LOCK_RES_RECOVERING|
@@ -315,18 +306,16 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
converting:
if (list_empty(&res->converting))
goto blocked;
- mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
- res->lockname.name);
+ mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
+ res->lockname.len, res->lockname.name);
target = list_entry(res->converting.next, struct dlm_lock, list);
if (target->ml.convert_type == LKM_IVMODE) {
- mlog(ML_ERROR, "%.*s: converting a lock with no "
- "convert_type!\n", res->lockname.len, res->lockname.name);
+ mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
+ dlm->name, res->lockname.len, res->lockname.name);
BUG();
}
- head = &res->granted;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->granted, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type,
@@ -335,7 +324,7 @@ converting:
/* queue the BAST if not already */
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
/* update the highest_blocked if needed */
if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -343,9 +332,8 @@ converting:
target->ml.convert_type;
}
}
- head = &res->converting;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+
+ list_for_each_entry(lock, &res->converting, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type,
@@ -353,7 +341,7 @@ converting:
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.convert_type)
lock->ml.highest_blocked =
@@ -366,9 +354,12 @@ converting:
spin_lock(&target->spinlock);
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
- mlog(0, "calling ast for converting lock: %.*s, have: %d, "
- "granting: %d, node: %u\n", res->lockname.len,
- res->lockname.name, target->ml.type,
+ mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
+ "%d => %d, node %u\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
+ target->ml.type,
target->ml.convert_type, target->ml.node);
target->ml.type = target->ml.convert_type;
@@ -381,7 +372,7 @@ converting:
spin_unlock(&target->spinlock);
__dlm_lockres_reserve_ast(res);
- dlm_queue_ast(dlm, target);
+ __dlm_queue_ast(dlm, target);
/* go back and check for more */
goto converting;
}
@@ -391,32 +382,28 @@ blocked:
goto leave;
target = list_entry(res->blocked.next, struct dlm_lock, list);
- head = &res->granted;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->granted, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.type)
lock->ml.highest_blocked = target->ml.type;
}
}
- head = &res->converting;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->converting, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.type)
lock->ml.highest_blocked = target->ml.type;
@@ -429,11 +416,14 @@ blocked:
spin_lock(&target->spinlock);
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
- mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
- "node: %u\n", res->lockname.len, res->lockname.name,
+ mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
+ "node %u\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
target->ml.type, target->ml.node);
- // target->ml.type is already correct
+ /* target->ml.type is already correct */
list_move_tail(&target->list, &res->granted);
BUG_ON(!target->lksb);
@@ -442,7 +432,7 @@ blocked:
spin_unlock(&target->spinlock);
__dlm_lockres_reserve_ast(res);
- dlm_queue_ast(dlm, target);
+ __dlm_queue_ast(dlm, target);
/* go back and check for more */
goto converting;
}
@@ -454,7 +444,6 @@ leave:
/* must have NO locks when calling this with res !=NULL * */
void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- mlog_entry("dlm=%p, res=%p\n", dlm, res);
if (res) {
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
@@ -467,8 +456,6 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- mlog_entry("dlm=%p, res=%p\n", dlm, res);
-
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
@@ -485,13 +472,16 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
res->state |= DLM_LOCK_RES_DIRTY;
}
}
+
+ mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
}
/* Launch the NM thread for the mounted volume */
int dlm_launch_thread(struct dlm_ctxt *dlm)
{
- mlog(0, "starting dlm thread...\n");
+ mlog(0, "Starting dlm_thread...\n");
dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
if (IS_ERR(dlm->dlm_thread_task)) {
@@ -506,7 +496,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
void dlm_complete_thread(struct dlm_ctxt *dlm)
{
if (dlm->dlm_thread_task) {
- mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
+ mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
kthread_stop(dlm->dlm_thread_task);
dlm->dlm_thread_task = NULL;
}
@@ -537,7 +527,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
/* get an extra ref on lock */
dlm_lock_get(lock);
res = lock->lockres;
- mlog(0, "delivering an ast for this lockres\n");
+ mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
+ "node %u\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->ml.type, lock->ml.node);
BUG_ON(!lock->ast_pending);
@@ -558,9 +553,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
/* possible that another ast was queued while
* we were delivering the last one */
if (!list_empty(&lock->ast_list)) {
- mlog(0, "aha another ast got queued while "
- "we were finishing the last one. will "
- "keep the ast_pending flag set.\n");
+ mlog(0, "%s: res %.*s, AST queued while flushing last "
+ "one\n", dlm->name, res->lockname.len,
+ res->lockname.name);
} else
lock->ast_pending = 0;
@@ -591,8 +586,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
dlm_lock_put(lock);
spin_unlock(&dlm->ast_lock);
- mlog(0, "delivering a bast for this lockres "
- "(blocked = %d\n", hi);
+ mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
+ "blocked %d, node %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ hi, lock->ml.node);
if (lock->ml.node != dlm->node_num) {
ret = dlm_send_proxy_bast(dlm, res, lock, hi);
@@ -606,9 +605,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
/* possible that another bast was queued while
* we were delivering the last one */
if (!list_empty(&lock->bast_list)) {
- mlog(0, "aha another bast got queued while "
- "we were finishing the last one. will "
- "keep the bast_pending flag set.\n");
+ mlog(0, "%s: res %.*s, BAST queued while flushing last "
+ "one\n", dlm->name, res->lockname.len,
+ res->lockname.name);
} else
lock->bast_pending = 0;
@@ -672,14 +671,16 @@ static int dlm_thread(void *data)
/* lockres can be re-dirtied/re-added to the
* dirty_list in this gap, but that is ok */
+ spin_lock(&dlm->ast_lock);
spin_lock(&res->spinlock);
if (res->owner != dlm->node_num) {
__dlm_print_one_lock_resource(res);
- mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
- res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
- res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
- res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
- res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+ mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
+ " dirty %d\n", dlm->name,
+ !!(res->state & DLM_LOCK_RES_IN_PROGRESS),
+ !!(res->state & DLM_LOCK_RES_MIGRATING),
+ !!(res->state & DLM_LOCK_RES_RECOVERING),
+ !!(res->state & DLM_LOCK_RES_DIRTY));
}
BUG_ON(res->owner != dlm->node_num);
@@ -692,8 +693,9 @@ static int dlm_thread(void *data)
/* move it to the tail and keep going */
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
- mlog(0, "delaying list shuffling for in-"
- "progress lockres %.*s, state=%d\n",
+ spin_unlock(&dlm->ast_lock);
+ mlog(0, "%s: res %.*s, inprogress, delay list "
+ "shuffle, state %d\n", dlm->name,
res->lockname.len, res->lockname.name,
res->state);
delay = 1;
@@ -705,14 +707,11 @@ static int dlm_thread(void *data)
* spinlock and do NOT have the dlm lock.
* safe to reserve/queue asts and run the lists. */
- mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
- "res=%.*s\n", dlm->name,
- res->lockname.len, res->lockname.name);
-
/* called while holding lockres lock */
dlm_shuffle_lists(dlm, res);
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->ast_lock);
dlm_lockres_calc_usage(dlm, res);
@@ -731,7 +730,8 @@ in_progress:
/* unlikely, but we may need to give time to
* other tasks */
if (!--n) {
- mlog(0, "throttling dlm_thread\n");
+ mlog(0, "%s: Throttling dlm thread\n",
+ dlm->name);
break;
}
}
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index fcf879ed693..2e3c9dbab68 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,9 +28,7 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
@@ -122,7 +120,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
* that still has AST's pending... */
in_use = !list_empty(&lock->ast_list);
spin_unlock(&dlm->ast_lock);
- if (in_use) {
+ if (in_use && !(flags & LKM_CANCEL)) {
mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
"while waiting for an ast!", res->lockname.len,
res->lockname.name);
@@ -131,7 +129,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
- if (master_node) {
+ if (master_node && !(flags & LKM_CANCEL)) {
mlog(ML_ERROR, "lockres in progress!\n");
spin_unlock(&res->spinlock);
return DLM_FORWARD;
@@ -191,9 +189,11 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
DLM_UNLOCK_REGRANT_LOCK|
DLM_UNLOCK_CLEAR_CONVERT_TYPE);
- } else if (status == DLM_RECOVERING ||
- status == DLM_MIGRATING ||
- status == DLM_FORWARD) {
+ } else if (status == DLM_RECOVERING ||
+ status == DLM_MIGRATING ||
+ status == DLM_FORWARD ||
+ status == DLM_NOLOCKMGR
+ ) {
/* must clear the actions because this unlock
* is about to be retried. cannot free or do
* any list manipulation. */
@@ -202,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
res->lockname.name,
status==DLM_RECOVERING?"recovering":
(status==DLM_MIGRATING?"migrating":
- "forward"));
+ (status == DLM_FORWARD ? "forward" :
+ "nolockmanager")));
actions = 0;
}
if (flags & LKM_CANCEL)
@@ -319,7 +320,7 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
struct kvec vec[2];
size_t veclen = 1;
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+ mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
if (owner == dlm->node_num) {
/* ended up trying to contact ourself. this means
@@ -356,7 +357,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
mlog(0, "master was in-progress. retry\n");
ret = status;
} else {
- mlog_errno(tmpret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
if (dlm_is_host_down(tmpret)) {
/* NOTE: this seems strange, but it is what we want.
* when the master goes down during a cancel or
@@ -365,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
* updated state to the recovery master. this thread
* just needs to finish out the operation and call
* the unlockast. */
- ret = DLM_NORMAL;
+ if (dlm_is_node_dead(dlm, owner))
+ ret = DLM_NORMAL;
+ else
+ ret = DLM_NOLOCKMGR;
} else {
/* something bad. this will BUG in ocfs2 */
ret = dlm_err_to_dlm_status(tmpret);
@@ -389,7 +394,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
struct dlm_ctxt *dlm = data;
struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
- struct list_head *iter;
struct dlm_lock *lock = NULL;
enum dlm_status status = DLM_NORMAL;
int found = 0, i;
@@ -459,8 +463,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
}
for (i=0; i<3; i++) {
- list_for_each(iter, queue) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, queue, list) {
if (lock->ml.cookie == unlock->cookie &&
lock->ml.node == unlock->node_idx) {
dlm_lock_get(lock);
@@ -589,8 +592,6 @@ enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
struct dlm_lock *lock = NULL;
int call_ast, is_master;
- mlog_entry_void();
-
if (!lksb) {
dlm_error(DLM_BADARGS);
return DLM_BADARGS;
@@ -643,7 +644,9 @@ retry:
if (status == DLM_RECOVERING ||
status == DLM_MIGRATING ||
- status == DLM_FORWARD) {
+ status == DLM_FORWARD ||
+ status == DLM_NOLOCKMGR) {
+
/* We want to go away for a tiny bit to allow recovery
* / migration to complete on this resource. I don't
* know of any wait queue we could sleep on as this
@@ -655,21 +658,21 @@ retry:
msleep(50);
mlog(0, "retrying unlock due to pending recovery/"
- "migration/in-progress\n");
+ "migration/in-progress/reconnect\n");
goto retry;
}
if (call_ast) {
mlog(0, "calling unlockast(%p, %d)\n", data, status);
if (is_master) {
- /* it is possible that there is one last bast
+ /* it is possible that there is one last bast
* pending. make sure it is flushed, then
* call the unlockast.
* not an issue if this is a mastered remotely,
* since this lock has been removed from the
* lockres queues and cannot be found. */
dlm_kick_thread(dlm, NULL);
- wait_event(dlm->ast_wq,
+ wait_event(dlm->ast_wq,
dlm_lock_basts_flushed(dlm, lock));
}
(*unlockast)(data, status);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
deleted file mode 100644
index dfc0da4d158..00000000000
--- a/fs/ocfs2/dlm/dlmver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
-
-void dlm_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
deleted file mode 100644
index f674aee77a1..00000000000
--- a/fs/ocfs2/dlm/dlmver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLM_VER_H
-#define DLM_VER_H
-
-void dlm_print_version(void);
-
-#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
new file mode 100644
index 00000000000..eed3db8c5b4
--- /dev/null
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -0,0 +1,5 @@
+ccflags-y := -Ifs/ocfs2
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
+
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 1c9efb406a9..09b7d9dac71 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -43,24 +43,16 @@
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
+#include <linux/poll.h>
#include <asm/uaccess.h>
-
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
-#include "dlmapi.h"
-
+#include "stackglue.h"
#include "userdlm.h"
-#include "dlmfsver.h"
-
#define MLOG_MASK_PREFIX ML_DLMFS
#include "cluster/masklog.h"
-#include "ocfs2_lockingver.h"
static const struct super_operations dlmfs_ops;
static const struct file_operations dlmfs_file_operations;
@@ -71,15 +63,46 @@ static struct kmem_cache *dlmfs_inode_cache;
struct workqueue_struct *user_dlm_worker;
+
+
/*
- * This is the userdlmfs locking protocol version.
+ * These are the ABI capabilities of dlmfs.
+ *
+ * Over time, dlmfs has added some features that were not part of the
+ * initial ABI. Unfortunately, some of these features are not detectable
+ * via standard usage. For example, Linux's default poll always returns
+ * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
+ * added poll support. Instead, we provide this list of new capabilities.
*
- * See fs/ocfs2/dlmglue.c for more details on locking versions.
+ * Capabilities is a read-only attribute. We do it as a module parameter
+ * so we can discover it whether dlmfs is built in, loaded, or even not
+ * loaded.
+ *
+ * The ABI features are local to this machine's dlmfs mount. This is
+ * distinct from the locking protocol, which is concerned with inter-node
+ * interaction.
+ *
+ * Capabilities:
+ * - bast : POLLIN against the file descriptor of a held lock
+ * signifies a bast fired on the lock.
*/
-static const struct dlm_protocol_version user_locking_protocol = {
- .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
- .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
-};
+#define DLMFS_CAPABILITIES "bast stackglue"
+static int param_set_dlmfs_capabilities(const char *val,
+ struct kernel_param *kp)
+{
+ printk(KERN_ERR "%s: readonly parameter\n", kp->name);
+ return -EINVAL;
+}
+static int param_get_dlmfs_capabilities(char *buffer,
+ struct kernel_param *kp)
+{
+ return strlcpy(buffer, DLMFS_CAPABILITIES,
+ strlen(DLMFS_CAPABILITIES) + 1);
+}
+module_param_call(capabilities, param_set_dlmfs_capabilities,
+ param_get_dlmfs_capabilities, NULL, 0444);
+MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
+
/*
* decodes a set of open flags into a valid lock level and a set of flags.
@@ -88,20 +111,20 @@ static const struct dlm_protocol_version user_locking_protocol = {
* O_RDONLY -> PRMODE level
* O_WRONLY -> EXMODE level
*
- * O_NONBLOCK -> LKM_NOQUEUE
+ * O_NONBLOCK -> NOQUEUE
*/
static int dlmfs_decode_open_flags(int open_flags,
int *level,
int *flags)
{
if (open_flags & (O_WRONLY|O_RDWR))
- *level = LKM_EXMODE;
+ *level = DLM_LOCK_EX;
else
- *level = LKM_PRMODE;
+ *level = DLM_LOCK_PR;
*flags = 0;
if (open_flags & O_NONBLOCK)
- *flags |= LKM_NOQUEUE;
+ *flags |= DLM_LKF_NOQUEUE;
return 0;
}
@@ -142,7 +165,7 @@ static int dlmfs_file_open(struct inode *inode,
* to be able userspace to be able to distinguish a
* valid lock request from one that simply couldn't be
* granted. */
- if (flags & LKM_NOQUEUE && status == -EAGAIN)
+ if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
status = -ETXTBSY;
kfree(fp);
goto bail;
@@ -158,8 +181,7 @@ static int dlmfs_file_release(struct inode *inode,
{
int level, status;
struct dlmfs_inode_private *ip = DLMFS_I(inode);
- struct dlmfs_filp_private *fp =
- (struct dlmfs_filp_private *) file->private_data;
+ struct dlmfs_filp_private *fp = file->private_data;
if (S_ISDIR(inode->i_mode))
BUG();
@@ -169,7 +191,7 @@ static int dlmfs_file_release(struct inode *inode,
status = 0;
if (fp) {
level = fp->fp_lock_level;
- if (level != LKM_IVMODE)
+ if (level != DLM_LOCK_IV)
user_dlm_cluster_unlock(&ip->ip_lockres, level);
kfree(fp);
@@ -179,15 +201,50 @@ static int dlmfs_file_release(struct inode *inode,
return 0;
}
+/*
+ * We do ->setattr() just to override size changes. Our size is the size
+ * of the LVB and nothing else.
+ */
+static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ int error;
+ struct inode *inode = dentry->d_inode;
+
+ attr->ia_valid &= ~ATTR_SIZE;
+ error = inode_change_ok(inode, attr);
+ if (error)
+ return error;
+
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
+ return 0;
+}
+
+static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
+{
+ int event = 0;
+ struct inode *inode = file_inode(file);
+ struct dlmfs_inode_private *ip = DLMFS_I(inode);
+
+ poll_wait(file, &ip->ip_lockres.l_event, wait);
+
+ spin_lock(&ip->ip_lockres.l_lock);
+ if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
+ event = POLLIN | POLLRDNORM;
+ spin_unlock(&ip->ip_lockres.l_lock);
+
+ return event;
+}
+
static ssize_t dlmfs_file_read(struct file *filp,
char __user *buf,
size_t count,
loff_t *ppos)
{
int bytes_left;
- ssize_t readlen;
+ ssize_t readlen, got;
char *lvb_buf;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
inode->i_ino, count, *ppos);
@@ -205,15 +262,19 @@ static ssize_t dlmfs_file_read(struct file *filp,
if ((count + *ppos) > i_size_read(inode))
readlen = i_size_read(inode) - *ppos;
else
- readlen = count - *ppos;
+ readlen = count;
lvb_buf = kmalloc(readlen, GFP_NOFS);
if (!lvb_buf)
return -ENOMEM;
- user_dlm_read_lvb(inode, lvb_buf, readlen);
- bytes_left = __copy_to_user(buf, lvb_buf, readlen);
- readlen -= bytes_left;
+ got = user_dlm_read_lvb(inode, lvb_buf, readlen);
+ if (got) {
+ BUG_ON(got != readlen);
+ bytes_left = __copy_to_user(buf, lvb_buf, readlen);
+ readlen -= bytes_left;
+ } else
+ readlen = 0;
kfree(lvb_buf);
@@ -231,7 +292,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
int bytes_left;
ssize_t writelen;
char *lvb_buf;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
inode->i_ino, count, *ppos);
@@ -272,7 +333,7 @@ static void dlmfs_init_once(void *foo)
struct dlmfs_inode_private *ip =
(struct dlmfs_inode_private *) foo;
- ip->ip_dlm = NULL;
+ ip->ip_conn = NULL;
ip->ip_parent = NULL;
inode_init_once(&ip->ip_vfs_inode);
@@ -289,18 +350,23 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
return &ip->ip_vfs_inode;
}
-static void dlmfs_destroy_inode(struct inode *inode)
+static void dlmfs_i_callback(struct rcu_head *head)
{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
}
-static void dlmfs_clear_inode(struct inode *inode)
+static void dlmfs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, dlmfs_i_callback);
+}
+
+static void dlmfs_evict_inode(struct inode *inode)
{
int status;
struct dlmfs_inode_private *ip;
- if (!inode)
- return;
+ clear_inode(inode);
mlog(0, "inode %lu\n", inode->i_ino);
@@ -314,17 +380,18 @@ static void dlmfs_clear_inode(struct inode *inode)
goto clear_fields;
}
- mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
+ mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
/* we must be a directory. If required, lets unregister the
* dlm context now. */
- if (ip->ip_dlm)
- user_dlm_unregister_context(ip->ip_dlm);
+ if (ip->ip_conn)
+ user_dlm_unregister(ip->ip_conn);
clear_fields:
ip->ip_parent = NULL;
- ip->ip_dlm = NULL;
+ ip->ip_conn = NULL;
}
static struct backing_dev_info dlmfs_backing_dev_info = {
+ .name = "ocfs2-dlmfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
@@ -332,15 +399,11 @@ static struct backing_dev_info dlmfs_backing_dev_info = {
static struct inode *dlmfs_get_root_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
- int mode = S_IFDIR | 0755;
- struct dlmfs_inode_private *ip;
+ umode_t mode = S_IFDIR | 0755;
if (inode) {
- ip = DLMFS_I(inode);
-
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
+ inode->i_ino = get_next_ino();
+ inode_init_owner(inode, NULL, mode);
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inc_nlink(inode);
@@ -354,7 +417,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
static struct inode *dlmfs_get_inode(struct inode *parent,
struct dentry *dentry,
- int mode)
+ umode_t mode)
{
struct super_block *sb = parent->i_sb;
struct inode * inode = new_inode(sb);
@@ -363,14 +426,13 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
if (!inode)
return NULL;
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
+ inode->i_ino = get_next_ino();
+ inode_init_owner(inode, parent, mode);
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
ip = DLMFS_I(inode);
- ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
+ ip->ip_conn = DLMFS_I(parent)->ip_conn;
switch (mode & S_IFMT) {
default:
@@ -402,13 +464,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
inc_nlink(inode);
break;
}
-
- if (parent->i_mode & S_ISGID) {
- inode->i_gid = parent->i_gid;
- if (S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
- }
-
return inode;
}
@@ -418,19 +473,18 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
/* SMP-safe */
static int dlmfs_mkdir(struct inode * dir,
struct dentry * dentry,
- int mode)
+ umode_t mode)
{
int status;
struct inode *inode = NULL;
struct qstr *domain = &dentry->d_name;
struct dlmfs_inode_private *ip;
- struct dlm_ctxt *dlm;
- struct dlm_protocol_version proto = user_locking_protocol;
+ struct ocfs2_cluster_connection *conn;
mlog(0, "mkdir %.*s\n", domain->len, domain->name);
/* verify that we have a proper domain */
- if (domain->len >= O2NM_MAX_NAME_LEN) {
+ if (domain->len >= GROUP_NAME_MAX) {
status = -EINVAL;
mlog(ML_ERROR, "invalid domain name for directory.\n");
goto bail;
@@ -445,14 +499,14 @@ static int dlmfs_mkdir(struct inode * dir,
ip = DLMFS_I(inode);
- dlm = user_dlm_register_context(domain, &proto);
- if (IS_ERR(dlm)) {
- status = PTR_ERR(dlm);
+ conn = user_dlm_register(domain);
+ if (IS_ERR(conn)) {
+ status = PTR_ERR(conn);
mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
status, domain->len, domain->name);
goto bail;
}
- ip->ip_dlm = dlm;
+ ip->ip_conn = conn;
inc_nlink(dir);
d_instantiate(dentry, inode);
@@ -467,8 +521,8 @@ bail:
static int dlmfs_create(struct inode *dir,
struct dentry *dentry,
- int mode,
- struct nameidata *nd)
+ umode_t mode,
+ bool excl)
{
int status = 0;
struct inode *inode;
@@ -524,32 +578,24 @@ static int dlmfs_fill_super(struct super_block * sb,
void * data,
int silent)
{
- struct inode * inode;
- struct dentry * root;
-
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = DLMFS_MAGIC;
sb->s_op = &dlmfs_ops;
- inode = dlmfs_get_root_inode(sb);
- if (!inode)
- return -ENOMEM;
-
- root = d_alloc_root(inode);
- if (!root) {
- iput(inode);
+ sb->s_root = d_make_root(dlmfs_get_root_inode(sb));
+ if (!sb->s_root)
return -ENOMEM;
- }
- sb->s_root = root;
return 0;
}
static const struct file_operations dlmfs_file_operations = {
.open = dlmfs_file_open,
.release = dlmfs_file_release,
+ .poll = dlmfs_file_poll,
.read = dlmfs_file_read,
.write = dlmfs_file_write,
+ .llseek = default_llseek,
};
static const struct inode_operations dlmfs_dir_inode_operations = {
@@ -569,34 +615,34 @@ static const struct super_operations dlmfs_ops = {
.statfs = simple_statfs,
.alloc_inode = dlmfs_alloc_inode,
.destroy_inode = dlmfs_destroy_inode,
- .clear_inode = dlmfs_clear_inode,
+ .evict_inode = dlmfs_evict_inode,
.drop_inode = generic_delete_inode,
};
static const struct inode_operations dlmfs_file_inode_operations = {
.getattr = simple_getattr,
+ .setattr = dlmfs_file_setattr,
};
-static int dlmfs_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
{
- return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
+ return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
}
static struct file_system_type dlmfs_fs_type = {
.owner = THIS_MODULE,
.name = "ocfs2_dlmfs",
- .get_sb = dlmfs_get_sb,
+ .mount = dlmfs_mount,
.kill_sb = kill_litter_super,
};
+MODULE_ALIAS_FS("ocfs2_dlmfs");
static int __init init_dlmfs_fs(void)
{
int status;
int cleanup_inode = 0, cleanup_worker = 0;
- dlmfs_print_version();
-
status = bdi_init(&dlmfs_backing_dev_info);
if (status)
return status;
@@ -619,6 +665,7 @@ static int __init init_dlmfs_fs(void)
}
cleanup_worker = 1;
+ user_dlm_set_locking_protocol();
status = register_filesystem(&dlmfs_fs_type);
bail:
if (status) {
@@ -639,6 +686,11 @@ static void __exit exit_dlmfs_fs(void)
flush_workqueue(user_dlm_worker);
destroy_workqueue(user_dlm_worker);
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(dlmfs_inode_cache);
bdi_destroy(&dlmfs_backing_dev_info);
@@ -646,6 +698,7 @@ static void __exit exit_dlmfs_fs(void)
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
module_init(init_dlmfs_fs)
module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 4cb1d3dae25..0499e3fb7bd 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -34,18 +34,19 @@
#include <linux/types.h>
#include <linux/crc32.h>
-
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
-#include "dlmapi.h"
-
+#include "ocfs2_lockingver.h"
+#include "stackglue.h"
#include "userdlm.h"
#define MLOG_MASK_PREFIX ML_DLMFS
#include "cluster/masklog.h"
+
+static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
+{
+ return container_of(lksb, struct user_lock_res, l_lksb);
+}
+
static inline int user_check_wait_flag(struct user_lock_res *lockres,
int flag)
{
@@ -73,15 +74,15 @@ static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
}
/* I heart container_of... */
-static inline struct dlm_ctxt *
-dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
+static inline struct ocfs2_cluster_connection *
+cluster_connection_from_user_lockres(struct user_lock_res *lockres)
{
struct dlmfs_inode_private *ip;
ip = container_of(lockres,
struct dlmfs_inode_private,
ip_lockres);
- return ip->ip_dlm;
+ return ip->ip_conn;
}
static struct inode *
@@ -103,9 +104,9 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
}
#define user_log_dlm_error(_func, _stat, _lockres) do { \
- mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \
- "resource %.*s: %s\n", dlm_errname(_stat), _func, \
- _lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \
+ mlog(ML_ERROR, "Dlm error %d while calling %s on " \
+ "resource %.*s\n", _stat, _func, \
+ _lockres->l_namelen, _lockres->l_name); \
} while (0)
/* WARNING: This function lives in a world where the only three lock
@@ -113,34 +114,35 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
* lock types are added. */
static inline int user_highest_compat_lock_level(int level)
{
- int new_level = LKM_EXMODE;
+ int new_level = DLM_LOCK_EX;
- if (level == LKM_EXMODE)
- new_level = LKM_NLMODE;
- else if (level == LKM_PRMODE)
- new_level = LKM_PRMODE;
+ if (level == DLM_LOCK_EX)
+ new_level = DLM_LOCK_NL;
+ else if (level == DLM_LOCK_PR)
+ new_level = DLM_LOCK_PR;
return new_level;
}
-static void user_ast(void *opaque)
+static void user_ast(struct ocfs2_dlm_lksb *lksb)
{
- struct user_lock_res *lockres = opaque;
- struct dlm_lockstatus *lksb;
+ struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
+ int status;
- mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen,
- lockres->l_name);
+ mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
+ lockres->l_namelen, lockres->l_name, lockres->l_level,
+ lockres->l_requested);
spin_lock(&lockres->l_lock);
- lksb = &(lockres->l_lksb);
- if (lksb->status != DLM_NORMAL) {
+ status = ocfs2_dlm_lock_status(&lockres->l_lksb);
+ if (status) {
mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
- lksb->status, lockres->l_namelen, lockres->l_name);
+ status, lockres->l_namelen, lockres->l_name);
spin_unlock(&lockres->l_lock);
return;
}
- mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE,
+ mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
"Lockres %.*s, requested ivmode. flags 0x%x\n",
lockres->l_namelen, lockres->l_name, lockres->l_flags);
@@ -148,13 +150,13 @@ static void user_ast(void *opaque)
if (lockres->l_requested < lockres->l_level) {
if (lockres->l_requested <=
user_highest_compat_lock_level(lockres->l_blocking)) {
- lockres->l_blocking = LKM_NLMODE;
+ lockres->l_blocking = DLM_LOCK_NL;
lockres->l_flags &= ~USER_LOCK_BLOCKED;
}
}
lockres->l_level = lockres->l_requested;
- lockres->l_requested = LKM_IVMODE;
+ lockres->l_requested = DLM_LOCK_IV;
lockres->l_flags |= USER_LOCK_ATTACHED;
lockres->l_flags &= ~USER_LOCK_BUSY;
@@ -193,11 +195,11 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
return;
switch (lockres->l_blocking) {
- case LKM_EXMODE:
+ case DLM_LOCK_EX:
if (!lockres->l_ex_holders && !lockres->l_ro_holders)
queue = 1;
break;
- case LKM_PRMODE:
+ case DLM_LOCK_PR:
if (!lockres->l_ex_holders)
queue = 1;
break;
@@ -209,12 +211,12 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
__user_dlm_queue_lockres(lockres);
}
-static void user_bast(void *opaque, int level)
+static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
{
- struct user_lock_res *lockres = opaque;
+ struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
- mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n",
- lockres->l_namelen, lockres->l_name, level);
+ mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
+ lockres->l_namelen, lockres->l_name, level, lockres->l_level);
spin_lock(&lockres->l_lock);
lockres->l_flags |= USER_LOCK_BLOCKED;
@@ -227,15 +229,15 @@ static void user_bast(void *opaque, int level)
wake_up(&lockres->l_event);
}
-static void user_unlock_ast(void *opaque, enum dlm_status status)
+static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
{
- struct user_lock_res *lockres = opaque;
+ struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
- mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen,
- lockres->l_name);
+ mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
+ lockres->l_namelen, lockres->l_name, lockres->l_flags);
- if (status != DLM_NORMAL && status != DLM_CANCELGRANT)
- mlog(ML_ERROR, "Dlm returns status %d\n", status);
+ if (status)
+ mlog(ML_ERROR, "dlm returns status %d\n", status);
spin_lock(&lockres->l_lock);
/* The teardown flag gets set early during the unlock process,
@@ -243,7 +245,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
* for a concurrent cancel. */
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
&& !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
- lockres->l_level = LKM_IVMODE;
+ lockres->l_level = DLM_LOCK_IV;
} else if (status == DLM_CANCELGRANT) {
/* We tried to cancel a convert request, but it was
* already granted. Don't clear the busy flag - the
@@ -254,7 +256,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
} else {
BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
/* Cancel succeeded, we want to re-queue */
- lockres->l_requested = LKM_IVMODE; /* cancel an
+ lockres->l_requested = DLM_LOCK_IV; /* cancel an
* upconvert
* request. */
lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
@@ -271,6 +273,21 @@ out_noclear:
wake_up(&lockres->l_event);
}
+/*
+ * This is the userdlmfs locking protocol version.
+ *
+ * See fs/ocfs2/dlmglue.c for more details on locking versions.
+ */
+static struct ocfs2_locking_protocol user_dlm_lproto = {
+ .lp_max_version = {
+ .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+ .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+ },
+ .lp_lock_ast = user_ast,
+ .lp_blocking_ast = user_bast,
+ .lp_unlock_ast = user_unlock_ast,
+};
+
static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
{
struct inode *inode;
@@ -283,10 +300,10 @@ static void user_dlm_unblock_lock(struct work_struct *work)
int new_level, status;
struct user_lock_res *lockres =
container_of(work, struct user_lock_res, l_work);
- struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+ struct ocfs2_cluster_connection *conn =
+ cluster_connection_from_user_lockres(lockres);
- mlog(0, "processing lockres %.*s\n", lockres->l_namelen,
- lockres->l_name);
+ mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
spin_lock(&lockres->l_lock);
@@ -304,17 +321,23 @@ static void user_dlm_unblock_lock(struct work_struct *work)
* flag, and finally we might get another bast which re-queues
* us before our ast for the downconvert is called. */
if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
+ mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
+ lockres->l_namelen, lockres->l_name);
spin_unlock(&lockres->l_lock);
goto drop_ref;
}
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+ mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
+ lockres->l_namelen, lockres->l_name);
spin_unlock(&lockres->l_lock);
goto drop_ref;
}
if (lockres->l_flags & USER_LOCK_BUSY) {
if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
+ mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
+ lockres->l_namelen, lockres->l_name);
spin_unlock(&lockres->l_lock);
goto drop_ref;
}
@@ -322,32 +345,31 @@ static void user_dlm_unblock_lock(struct work_struct *work)
lockres->l_flags |= USER_LOCK_IN_CANCEL;
spin_unlock(&lockres->l_lock);
- status = dlmunlock(dlm,
- &lockres->l_lksb,
- LKM_CANCEL,
- user_unlock_ast,
- lockres);
- if (status != DLM_NORMAL)
- user_log_dlm_error("dlmunlock", status, lockres);
+ status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
+ DLM_LKF_CANCEL);
+ if (status)
+ user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
goto drop_ref;
}
/* If there are still incompat holders, we can exit safely
* without worrying about re-queueing this lock as that will
* happen on the last call to user_cluster_unlock. */
- if ((lockres->l_blocking == LKM_EXMODE)
+ if ((lockres->l_blocking == DLM_LOCK_EX)
&& (lockres->l_ex_holders || lockres->l_ro_holders)) {
spin_unlock(&lockres->l_lock);
- mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
- lockres->l_ro_holders, lockres->l_ex_holders);
+ mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
+ lockres->l_namelen, lockres->l_name,
+ lockres->l_ex_holders, lockres->l_ro_holders);
goto drop_ref;
}
- if ((lockres->l_blocking == LKM_PRMODE)
+ if ((lockres->l_blocking == DLM_LOCK_PR)
&& lockres->l_ex_holders) {
spin_unlock(&lockres->l_lock);
- mlog(0, "can't downconvert for pr: ex = %u\n",
- lockres->l_ex_holders);
+ mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
+ lockres->l_namelen, lockres->l_name,
+ lockres->l_ex_holders);
goto drop_ref;
}
@@ -355,22 +377,17 @@ static void user_dlm_unblock_lock(struct work_struct *work)
new_level = user_highest_compat_lock_level(lockres->l_blocking);
lockres->l_requested = new_level;
lockres->l_flags |= USER_LOCK_BUSY;
- mlog(0, "Downconvert lock from %d to %d\n",
- lockres->l_level, new_level);
+ mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
+ lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
spin_unlock(&lockres->l_lock);
/* need lock downconvert request now... */
- status = dlmlock(dlm,
- new_level,
- &lockres->l_lksb,
- LKM_CONVERT|LKM_VALBLK,
- lockres->l_name,
- lockres->l_namelen,
- user_ast,
- lockres,
- user_bast);
- if (status != DLM_NORMAL) {
- user_log_dlm_error("dlmlock", status, lockres);
+ status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
+ DLM_LKF_CONVERT|DLM_LKF_VALBLK,
+ lockres->l_name,
+ lockres->l_namelen);
+ if (status) {
+ user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
user_recover_from_dlm_error(lockres);
}
@@ -382,10 +399,10 @@ static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
int level)
{
switch(level) {
- case LKM_EXMODE:
+ case DLM_LOCK_EX:
lockres->l_ex_holders++;
break;
- case LKM_PRMODE:
+ case DLM_LOCK_PR:
lockres->l_ro_holders++;
break;
default:
@@ -410,20 +427,19 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
int lkm_flags)
{
int status, local_flags;
- struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+ struct ocfs2_cluster_connection *conn =
+ cluster_connection_from_user_lockres(lockres);
- if (level != LKM_EXMODE &&
- level != LKM_PRMODE) {
+ if (level != DLM_LOCK_EX &&
+ level != DLM_LOCK_PR) {
mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
lockres->l_namelen, lockres->l_name);
status = -EINVAL;
goto bail;
}
- mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n",
- lockres->l_namelen, lockres->l_name,
- (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
- lkm_flags);
+ mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
+ lockres->l_namelen, lockres->l_name, level, lkm_flags);
again:
if (signal_pending(current)) {
@@ -457,35 +473,26 @@ again:
}
if (level > lockres->l_level) {
- local_flags = lkm_flags | LKM_VALBLK;
- if (lockres->l_level != LKM_IVMODE)
- local_flags |= LKM_CONVERT;
+ local_flags = lkm_flags | DLM_LKF_VALBLK;
+ if (lockres->l_level != DLM_LOCK_IV)
+ local_flags |= DLM_LKF_CONVERT;
lockres->l_requested = level;
lockres->l_flags |= USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
- BUG_ON(level == LKM_IVMODE);
- BUG_ON(level == LKM_NLMODE);
+ BUG_ON(level == DLM_LOCK_IV);
+ BUG_ON(level == DLM_LOCK_NL);
/* call dlm_lock to upgrade lock now */
- status = dlmlock(dlm,
- level,
- &lockres->l_lksb,
- local_flags,
- lockres->l_name,
- lockres->l_namelen,
- user_ast,
- lockres,
- user_bast);
- if (status != DLM_NORMAL) {
- if ((lkm_flags & LKM_NOQUEUE) &&
- (status == DLM_NOTQUEUED))
- status = -EAGAIN;
- else {
- user_log_dlm_error("dlmlock", status, lockres);
- status = -EINVAL;
- }
+ status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
+ local_flags, lockres->l_name,
+ lockres->l_namelen);
+ if (status) {
+ if ((lkm_flags & DLM_LKF_NOQUEUE) &&
+ (status != -EAGAIN))
+ user_log_dlm_error("ocfs2_dlm_lock",
+ status, lockres);
user_recover_from_dlm_error(lockres);
goto bail;
}
@@ -506,11 +513,11 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
int level)
{
switch(level) {
- case LKM_EXMODE:
+ case DLM_LOCK_EX:
BUG_ON(!lockres->l_ex_holders);
lockres->l_ex_holders--;
break;
- case LKM_PRMODE:
+ case DLM_LOCK_PR:
BUG_ON(!lockres->l_ro_holders);
lockres->l_ro_holders--;
break;
@@ -522,8 +529,8 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
void user_dlm_cluster_unlock(struct user_lock_res *lockres,
int level)
{
- if (level != LKM_EXMODE &&
- level != LKM_PRMODE) {
+ if (level != DLM_LOCK_EX &&
+ level != DLM_LOCK_PR) {
mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
lockres->l_namelen, lockres->l_name);
return;
@@ -540,33 +547,40 @@ void user_dlm_write_lvb(struct inode *inode,
unsigned int len)
{
struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
- char *lvb = lockres->l_lksb.lvb;
+ char *lvb;
BUG_ON(len > DLM_LVB_LEN);
spin_lock(&lockres->l_lock);
- BUG_ON(lockres->l_level < LKM_EXMODE);
+ BUG_ON(lockres->l_level < DLM_LOCK_EX);
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
memcpy(lvb, val, len);
spin_unlock(&lockres->l_lock);
}
-void user_dlm_read_lvb(struct inode *inode,
- char *val,
- unsigned int len)
+ssize_t user_dlm_read_lvb(struct inode *inode,
+ char *val,
+ unsigned int len)
{
struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
- char *lvb = lockres->l_lksb.lvb;
+ char *lvb;
+ ssize_t ret = len;
BUG_ON(len > DLM_LVB_LEN);
spin_lock(&lockres->l_lock);
- BUG_ON(lockres->l_level < LKM_PRMODE);
- memcpy(val, lvb, len);
+ BUG_ON(lockres->l_level < DLM_LOCK_PR);
+ if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ memcpy(val, lvb, len);
+ } else
+ ret = 0;
spin_unlock(&lockres->l_lock);
+ return ret;
}
void user_dlm_lock_res_init(struct user_lock_res *lockres,
@@ -576,9 +590,9 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
spin_lock_init(&lockres->l_lock);
init_waitqueue_head(&lockres->l_event);
- lockres->l_level = LKM_IVMODE;
- lockres->l_requested = LKM_IVMODE;
- lockres->l_blocking = LKM_IVMODE;
+ lockres->l_level = DLM_LOCK_IV;
+ lockres->l_requested = DLM_LOCK_IV;
+ lockres->l_blocking = DLM_LOCK_IV;
/* should have been checked before getting here. */
BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
@@ -592,9 +606,10 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
int user_dlm_destroy_lock(struct user_lock_res *lockres)
{
int status = -EBUSY;
- struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+ struct ocfs2_cluster_connection *conn =
+ cluster_connection_from_user_lockres(lockres);
- mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name);
+ mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
spin_lock(&lockres->l_lock);
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
@@ -627,14 +642,9 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
lockres->l_flags |= USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
- status = dlmunlock(dlm,
- &lockres->l_lksb,
- LKM_VALBLK,
- user_unlock_ast,
- lockres);
- if (status != DLM_NORMAL) {
- user_log_dlm_error("dlmunlock", status, lockres);
- status = -EINVAL;
+ status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
+ if (status) {
+ user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
goto bail;
}
@@ -645,32 +655,34 @@ bail:
return status;
}
-struct dlm_ctxt *user_dlm_register_context(struct qstr *name,
- struct dlm_protocol_version *proto)
+static void user_dlm_recovery_handler_noop(int node_num,
+ void *recovery_data)
{
- struct dlm_ctxt *dlm;
- u32 dlm_key;
- char *domain;
-
- domain = kmalloc(name->len + 1, GFP_NOFS);
- if (!domain) {
- mlog_errno(-ENOMEM);
- return ERR_PTR(-ENOMEM);
- }
+ /* We ignore recovery events */
+ return;
+}
- dlm_key = crc32_le(0, name->name, name->len);
+void user_dlm_set_locking_protocol(void)
+{
+ ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
+}
- snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
+struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
+{
+ int rc;
+ struct ocfs2_cluster_connection *conn;
- dlm = dlm_register_domain(domain, dlm_key, proto);
- if (IS_ERR(dlm))
- mlog_errno(PTR_ERR(dlm));
+ rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
+ &user_dlm_lproto,
+ user_dlm_recovery_handler_noop,
+ NULL, &conn);
+ if (rc)
+ mlog_errno(rc);
- kfree(domain);
- return dlm;
+ return rc ? ERR_PTR(rc) : conn;
}
-void user_dlm_unregister_context(struct dlm_ctxt *dlm)
+void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
{
- dlm_unregister_domain(dlm);
+ ocfs2_cluster_disconnect(conn, 0);
}
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 0c3cc03c61f..3b42d79531d 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -57,7 +57,7 @@ struct user_lock_res {
int l_level;
unsigned int l_ro_holders;
unsigned int l_ex_holders;
- struct dlm_lockstatus l_lksb;
+ struct ocfs2_dlm_lksb l_lksb;
int l_requested;
int l_blocking;
@@ -80,15 +80,15 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres,
void user_dlm_write_lvb(struct inode *inode,
const char *val,
unsigned int len);
-void user_dlm_read_lvb(struct inode *inode,
- char *val,
- unsigned int len);
-struct dlm_ctxt *user_dlm_register_context(struct qstr *name,
- struct dlm_protocol_version *proto);
-void user_dlm_unregister_context(struct dlm_ctxt *dlm);
+ssize_t user_dlm_read_lvb(struct inode *inode,
+ char *val,
+ unsigned int len);
+struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
+void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
+void user_dlm_set_locking_protocol(void);
struct dlmfs_inode_private {
- struct dlm_ctxt *ip_dlm;
+ struct ocfs2_cluster_connection *ip_conn;
struct user_lock_res ip_lockres; /* unused for directories. */
struct inode *ip_parent;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e15fc7d5082..52cfe99ae05 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -53,6 +53,7 @@
#include "super.h"
#include "uptodate.h"
#include "quota.h"
+#include "refcounttree.h"
#include "buffer_head_io.h"
@@ -63,7 +64,7 @@ struct ocfs2_mask_waiter {
unsigned long mw_mask;
unsigned long mw_goal;
#ifdef CONFIG_OCFS2_FS_STATS
- unsigned long long mw_lock_start;
+ ktime_t mw_lock_start;
#endif
};
@@ -92,6 +93,9 @@ struct ocfs2_unblock_ctl {
enum ocfs2_unblock_action unblock_action;
};
+/* Lockdep class keys */
+struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
+
static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
int new_level);
static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres);
@@ -107,6 +111,11 @@ static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
+static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level);
+static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
+ int blocking);
+
#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
/* This aids in debugging situations where a bad LVB might be involved. */
@@ -248,6 +257,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
+ .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.get_osb = ocfs2_get_dentry_osb,
.post_unlock = ocfs2_dentry_post_unlock,
@@ -271,6 +284,12 @@ static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
.flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
};
+static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
+ .check_downconvert = ocfs2_check_refcount_downconvert,
+ .downconvert_worker = ocfs2_refcount_convert_worker,
+ .flags = 0,
+};
+
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -278,6 +297,11 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
}
+static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
+{
+ return container_of(lksb, struct ocfs2_lock_res, l_lksb);
+}
+
static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
{
BUG_ON(!ocfs2_is_inode_lock(lockres));
@@ -299,6 +323,12 @@ static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_re
return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
}
+static inline struct ocfs2_refcount_tree *
+ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res)
+{
+ return container_of(res, struct ocfs2_refcount_tree, rf_lockres);
+}
+
static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
{
if (lockres->l_ops->get_osb)
@@ -313,9 +343,16 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
u32 dlm_flags);
static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
int wanted);
-static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres,
- int level);
+static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres,
+ int level, unsigned long caller_ip);
+static inline void ocfs2_cluster_unlock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres,
+ int level)
+{
+ __ocfs2_cluster_unlock(osb, lockres, level, _RET_IP_);
+}
+
static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
@@ -360,8 +397,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
{
int len;
- mlog_entry_void();
-
BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
@@ -371,8 +406,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
mlog(0, "built lock resource with name: %s\n", name);
-
- mlog_exit_void();
}
static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
@@ -398,44 +431,41 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
#ifdef CONFIG_OCFS2_FS_STATS
static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
{
- res->l_lock_num_prmode = 0;
- res->l_lock_num_prmode_failed = 0;
- res->l_lock_total_prmode = 0;
- res->l_lock_max_prmode = 0;
- res->l_lock_num_exmode = 0;
- res->l_lock_num_exmode_failed = 0;
- res->l_lock_total_exmode = 0;
- res->l_lock_max_exmode = 0;
res->l_lock_refresh = 0;
+ memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats));
+ memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats));
}
static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
struct ocfs2_mask_waiter *mw, int ret)
{
- unsigned long long *num, *sum;
- unsigned int *max, *failed;
- struct timespec ts = current_kernel_time();
- unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start;
-
- if (level == LKM_PRMODE) {
- num = &res->l_lock_num_prmode;
- sum = &res->l_lock_total_prmode;
- max = &res->l_lock_max_prmode;
- failed = &res->l_lock_num_prmode_failed;
- } else if (level == LKM_EXMODE) {
- num = &res->l_lock_num_exmode;
- sum = &res->l_lock_total_exmode;
- max = &res->l_lock_max_exmode;
- failed = &res->l_lock_num_exmode_failed;
- } else
+ u32 usec;
+ ktime_t kt;
+ struct ocfs2_lock_stats *stats;
+
+ if (level == LKM_PRMODE)
+ stats = &res->l_lock_prmode;
+ else if (level == LKM_EXMODE)
+ stats = &res->l_lock_exmode;
+ else
return;
- (*num)++;
- (*sum) += time;
- if (time > *max)
- *max = time;
+ kt = ktime_sub(ktime_get(), mw->mw_lock_start);
+ usec = ktime_to_us(kt);
+
+ stats->ls_gets++;
+ stats->ls_total += ktime_to_ns(kt);
+ /* overflow */
+ if (unlikely(stats->ls_gets == 0)) {
+ stats->ls_gets++;
+ stats->ls_total = ktime_to_ns(kt);
+ }
+
+ if (stats->ls_max < usec)
+ stats->ls_max = usec;
+
if (ret)
- (*failed)++;
+ stats->ls_fail++;
}
static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
@@ -445,8 +475,7 @@ static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
{
- struct timespec ts = current_kernel_time();
- mw->mw_lock_start = timespec_to_ns(&ts);
+ mw->mw_lock_start = ktime_get();
}
#else
static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
@@ -485,6 +514,13 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
ocfs2_init_lock_stats(res);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (type != OCFS2_LOCK_TYPE_OPEN)
+ lockdep_init_map(&res->l_lockdep_map, ocfs2_lock_type_strings[type],
+ &lockdep_keys[type], 0);
+ else
+ res->l_lockdep_map.key = NULL;
+#endif
}
void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
@@ -637,6 +673,15 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_nfs_sync_lops, osb);
}
+static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
+ struct ocfs2_super *osb)
+{
+ ocfs2_lock_res_init_once(res);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
+ ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+ &ocfs2_orphan_scan_lops, osb);
+}
+
void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_file_private *fp)
{
@@ -663,10 +708,19 @@ void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
info);
}
-void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_super *osb, u64 ref_blkno,
+ unsigned int generation)
{
- mlog_entry_void();
+ ocfs2_lock_res_init_once(lockres);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno,
+ generation, lockres->l_name);
+ ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT,
+ &ocfs2_refcount_block_lops, osb);
+}
+void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
+{
if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
return;
@@ -692,14 +746,11 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
memset(&res->l_lksb, 0, sizeof(res->l_lksb));
res->l_flags = 0UL;
- mlog_exit_void();
}
static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
int level)
{
- mlog_entry_void();
-
BUG_ON(!lockres);
switch(level) {
@@ -712,15 +763,11 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
default:
BUG();
}
-
- mlog_exit_void();
}
static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
int level)
{
- mlog_entry_void();
-
BUG_ON(!lockres);
switch(level) {
@@ -735,7 +782,6 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
default:
BUG();
}
- mlog_exit_void();
}
/* WARNING: This function lives in a world where the only three lock
@@ -782,8 +828,6 @@ static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
{
- mlog_entry_void();
-
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
@@ -796,14 +840,10 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res
lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
}
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-
- mlog_exit_void();
}
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
{
- mlog_entry_void();
-
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
@@ -816,15 +856,19 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
lockres->l_level = lockres->l_requested;
- lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
- mlog_exit_void();
+ /*
+ * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
+ * the OCFS2_LOCK_BUSY flag to prevent the dc thread from
+ * downconverting the lock before the upconvert has fully completed.
+ */
+ lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+
+ lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
}
static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
{
- mlog_entry_void();
-
BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
@@ -836,20 +880,15 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
lockres->l_level = lockres->l_requested;
lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-
- mlog_exit_void();
}
static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
int level)
{
int needs_downconvert = 0;
- mlog_entry_void();
assert_spin_locked(&lockres->l_lock);
- lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
-
if (level > lockres->l_blocking) {
/* only schedule a downconvert if we haven't already scheduled
* one that goes low enough to satisfy the level we're
@@ -862,7 +901,13 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
lockres->l_blocking = level;
}
- mlog_exit(needs_downconvert);
+ mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
+ lockres->l_name, level, lockres->l_level, lockres->l_blocking,
+ needs_downconvert);
+
+ if (needs_downconvert)
+ lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+ mlog(0, "needs_downconvert = %d\n", needs_downconvert);
return needs_downconvert;
}
@@ -972,18 +1017,17 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
return lockres->l_pending_gen;
}
-
-static void ocfs2_blocking_ast(void *opaque, int level)
+static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
{
- struct ocfs2_lock_res *lockres = opaque;
+ struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
int needs_downconvert;
unsigned long flags;
BUG_ON(level <= DLM_LOCK_NL);
- mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
- lockres->l_name, level, lockres->l_level,
+ mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
+ "type %s\n", lockres->l_name, level, lockres->l_level,
ocfs2_lock_type_string(lockres->l_type));
/*
@@ -1004,9 +1048,9 @@ static void ocfs2_blocking_ast(void *opaque, int level)
ocfs2_wake_downconvert_thread(osb);
}
-static void ocfs2_locking_ast(void *opaque)
+static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
{
- struct ocfs2_lock_res *lockres = opaque;
+ struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
unsigned long flags;
int status;
@@ -1027,6 +1071,10 @@ static void ocfs2_locking_ast(void *opaque)
return;
}
+ mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
+ "level %d => %d\n", lockres->l_name, lockres->l_action,
+ lockres->l_unlock_action, lockres->l_level, lockres->l_requested);
+
switch(lockres->l_action) {
case OCFS2_AST_ATTACH:
ocfs2_generic_handle_attach_action(lockres);
@@ -1039,8 +1087,8 @@ static void ocfs2_locking_ast(void *opaque)
ocfs2_generic_handle_downconvert_action(lockres);
break;
default:
- mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
- "lockres flags = 0x%lx, unlock action: %u\n",
+ mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
+ "flags 0x%lx, unlock: %u\n",
lockres->l_name, lockres->l_action, lockres->l_flags,
lockres->l_unlock_action);
BUG();
@@ -1066,14 +1114,91 @@ out:
spin_unlock_irqrestore(&lockres->l_lock, flags);
}
+static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
+{
+ struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
+ unsigned long flags;
+
+ mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
+ lockres->l_name, lockres->l_unlock_action);
+
+ spin_lock_irqsave(&lockres->l_lock, flags);
+ if (error) {
+ mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
+ "unlock_action %d\n", error, lockres->l_name,
+ lockres->l_unlock_action);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ return;
+ }
+
+ switch(lockres->l_unlock_action) {
+ case OCFS2_UNLOCK_CANCEL_CONVERT:
+ mlog(0, "Cancel convert success for %s\n", lockres->l_name);
+ lockres->l_action = OCFS2_AST_INVALID;
+ /* Downconvert thread may have requeued this lock, we
+ * need to wake it. */
+ if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+ ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
+ break;
+ case OCFS2_UNLOCK_DROP_LOCK:
+ lockres->l_level = DLM_LOCK_IV;
+ break;
+ default:
+ BUG();
+ }
+
+ lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+ lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+ wake_up(&lockres->l_event);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+/*
+ * This is the filesystem locking protocol. It provides the lock handling
+ * hooks for the underlying DLM. It has a maximum version number.
+ * The version number allows interoperability with systems running at
+ * the same major number and an equal or smaller minor number.
+ *
+ * Whenever the filesystem does new things with locks (adds or removes a
+ * lock, orders them differently, does different things underneath a lock),
+ * the version must be changed. The protocol is negotiated when joining
+ * the dlm domain. A node may join the domain if its major version is
+ * identical to all other nodes and its minor version is greater than
+ * or equal to all other nodes. When its minor version is greater than
+ * the other nodes, it will run at the minor version specified by the
+ * other nodes.
+ *
+ * If a locking change is made that will not be compatible with older
+ * versions, the major number must be increased and the minor version set
+ * to zero. If a change merely adds a behavior that can be disabled when
+ * speaking to older versions, the minor version must be increased. If a
+ * change adds a fully backwards compatible change (eg, LVB changes that
+ * are just ignored by older versions), the version does not need to be
+ * updated.
+ */
+static struct ocfs2_locking_protocol lproto = {
+ .lp_max_version = {
+ .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+ .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+ },
+ .lp_lock_ast = ocfs2_locking_ast,
+ .lp_blocking_ast = ocfs2_blocking_ast,
+ .lp_unlock_ast = ocfs2_unlock_ast,
+};
+
+void ocfs2_set_locking_protocol(void)
+{
+ ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
+}
+
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
int convert)
{
unsigned long flags;
- mlog_entry_void();
spin_lock_irqsave(&lockres->l_lock, flags);
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+ lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
if (convert)
lockres->l_action = OCFS2_AST_INVALID;
else
@@ -1081,7 +1206,6 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
spin_unlock_irqrestore(&lockres->l_lock, flags);
wake_up(&lockres->l_event);
- mlog_exit_void();
}
/* Note: If we detect another process working on the lock (i.e.,
@@ -1097,8 +1221,6 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
unsigned long flags;
unsigned int gen;
- mlog_entry_void();
-
mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
dlm_flags);
@@ -1120,8 +1242,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
&lockres->l_lksb,
dlm_flags,
lockres->l_name,
- OCFS2_LOCK_ID_MAX_LEN - 1,
- lockres);
+ OCFS2_LOCK_ID_MAX_LEN - 1);
lockres_clear_pending(lockres, gen, osb);
if (ret) {
ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1131,7 +1252,6 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
bail:
- mlog_exit(ret);
return ret;
}
@@ -1184,7 +1304,7 @@ static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
{
wait_for_completion(&mw->mw_complete);
/* Re-arm the completion in case we want to wait on it again */
- INIT_COMPLETION(mw->mw_complete);
+ reinit_completion(&mw->mw_complete);
return mw->mw_status;
}
@@ -1235,15 +1355,17 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
else
ret = mw->mw_status;
/* Re-arm the completion in case we want to wait on it again */
- INIT_COMPLETION(mw->mw_complete);
+ reinit_completion(&mw->mw_complete);
return ret;
}
-static int ocfs2_cluster_lock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres,
- int level,
- u32 lkm_flags,
- int arg_flags)
+static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres,
+ int level,
+ u32 lkm_flags,
+ int arg_flags,
+ int l_subclass,
+ unsigned long caller_ip)
{
struct ocfs2_mask_waiter mw;
int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
@@ -1252,8 +1374,6 @@ static int ocfs2_cluster_lock(struct ocfs2_super *osb,
unsigned int gen;
int noqueue_attempted = 0;
- mlog_entry_void();
-
ocfs2_init_mask_waiter(&mw);
if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
@@ -1262,13 +1382,13 @@ static int ocfs2_cluster_lock(struct ocfs2_super *osb,
again:
wait = 0;
+ spin_lock_irqsave(&lockres->l_lock, flags);
+
if (catch_signals && signal_pending(current)) {
ret = -ERESTARTSYS;
- goto out;
+ goto unlock;
}
- spin_lock_irqsave(&lockres->l_lock, flags);
-
mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
"Cluster lock called on freeing lockres %s! flags "
"0x%lx\n", lockres->l_name, lockres->l_flags);
@@ -1285,6 +1405,25 @@ again:
goto unlock;
}
+ if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) {
+ /*
+ * We've upconverted. If the lock now has a level we can
+ * work with, we take it. If, however, the lock is not at the
+ * required level, we go thru the full cycle. One way this could
+ * happen is if a process requesting an upconvert to PR is
+ * closely followed by another requesting upconvert to an EX.
+ * If the process requesting EX lands here, we want it to
+ * continue attempting to upconvert and let the process
+ * requesting PR take the lock.
+ * If multiple processes request upconvert to PR, the first one
+ * here will take the lock. The others will have to go thru the
+ * OCFS2_LOCK_BLOCKED check to ensure that there is no pending
+ * downconvert request.
+ */
+ if (level <= lockres->l_level)
+ goto update_holders;
+ }
+
if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
!ocfs2_may_continue_on_blocked_lock(lockres, level)) {
/* is the lock is currently blocked on behalf of
@@ -1322,7 +1461,7 @@ again:
BUG_ON(level == DLM_LOCK_IV);
BUG_ON(level == DLM_LOCK_NL);
- mlog(0, "lock %s, convert from %d to level = %d\n",
+ mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
lockres->l_name, lockres->l_level, level);
/* call dlm_lock to upgrade lock now */
@@ -1331,8 +1470,7 @@ again:
&lockres->l_lksb,
lkm_flags,
lockres->l_name,
- OCFS2_LOCK_ID_MAX_LEN - 1,
- lockres);
+ OCFS2_LOCK_ID_MAX_LEN - 1);
lockres_clear_pending(lockres, gen, osb);
if (ret) {
if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
@@ -1355,11 +1493,14 @@ again:
goto again;
}
+update_holders:
/* Ok, if we get here then we're good to go. */
ocfs2_inc_holders(lockres, level);
ret = 0;
unlock:
+ lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+
spin_unlock_irqrestore(&lockres->l_lock, flags);
out:
/*
@@ -1386,22 +1527,47 @@ out:
}
ocfs2_update_lock_stats(lockres, level, &mw, ret);
- mlog_exit(ret);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (!ret && lockres->l_lockdep_map.key != NULL) {
+ if (level == DLM_LOCK_PR)
+ rwsem_acquire_read(&lockres->l_lockdep_map, l_subclass,
+ !!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
+ caller_ip);
+ else
+ rwsem_acquire(&lockres->l_lockdep_map, l_subclass,
+ !!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
+ caller_ip);
+ }
+#endif
return ret;
}
-static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
- struct ocfs2_lock_res *lockres,
- int level)
+static inline int ocfs2_cluster_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres,
+ int level,
+ u32 lkm_flags,
+ int arg_flags)
+{
+ return __ocfs2_cluster_lock(osb, lockres, level, lkm_flags, arg_flags,
+ 0, _RET_IP_);
+}
+
+
+static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres,
+ int level,
+ unsigned long caller_ip)
{
unsigned long flags;
- mlog_entry_void();
spin_lock_irqsave(&lockres->l_lock, flags);
ocfs2_dec_holders(lockres, level);
ocfs2_downconvert_on_unlock(osb, lockres);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- mlog_exit_void();
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (lockres->l_lockdep_map.key != NULL)
+ rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
+#endif
}
static int ocfs2_create_new_lock(struct ocfs2_super *osb,
@@ -1435,8 +1601,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
BUG_ON(!inode);
BUG_ON(!ocfs2_inode_is_new(inode));
- mlog_entry_void();
-
mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
/* NOTE: That we don't increment any of the holder counts, nor
@@ -1470,7 +1634,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
}
bail:
- mlog_exit(ret);
return ret;
}
@@ -1482,8 +1645,6 @@ int ocfs2_rw_lock(struct inode *inode, int write)
BUG_ON(!inode);
- mlog_entry_void();
-
mlog(0, "inode %llu take %s RW lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
@@ -1500,7 +1661,6 @@ int ocfs2_rw_lock(struct inode *inode, int write)
if (status < 0)
mlog_errno(status);
- mlog_exit(status);
return status;
}
@@ -1510,16 +1670,12 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry_void();
-
mlog(0, "inode %llu drop %s RW lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
if (!ocfs2_mount_local(osb))
ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
-
- mlog_exit_void();
}
/*
@@ -1533,12 +1689,10 @@ int ocfs2_open_lock(struct inode *inode)
BUG_ON(!inode);
- mlog_entry_void();
-
mlog(0, "inode %llu take PRMODE open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- if (ocfs2_mount_local(osb))
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
goto out;
lockres = &OCFS2_I(inode)->ip_open_lockres;
@@ -1549,7 +1703,6 @@ int ocfs2_open_lock(struct inode *inode)
mlog_errno(status);
out:
- mlog_exit(status);
return status;
}
@@ -1561,12 +1714,16 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
BUG_ON(!inode);
- mlog_entry_void();
-
mlog(0, "inode %llu try to take %s open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (write)
+ status = -EROFS;
+ goto out;
+ }
+
if (ocfs2_mount_local(osb))
goto out;
@@ -1584,7 +1741,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
level, DLM_LKF_NOQUEUE, 0);
out:
- mlog_exit(status);
return status;
}
@@ -1596,8 +1752,6 @@ void ocfs2_open_unlock(struct inode *inode)
struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry_void();
-
mlog(0, "inode %llu drop open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -1612,7 +1766,7 @@ void ocfs2_open_unlock(struct inode *inode)
DLM_LOCK_EX);
out:
- mlog_exit_void();
+ return;
}
static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
@@ -1666,7 +1820,7 @@ out:
* ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
* flock() calls. The locking approach this requires is sufficiently
* different from all other cluster lock types that we implement a
- * seperate path to the "low-level" dlm calls. In particular:
+ * separate path to the "low-level" dlm calls. In particular:
*
* - No optimization of lock levels is done - we take at exactly
* what's been requested.
@@ -1736,8 +1890,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
spin_unlock_irqrestore(&lockres->l_lock, flags);
ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
- lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
- lockres);
+ lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
if (ret) {
if (!trylock || (ret != -EAGAIN)) {
ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1764,7 +1917,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
* outstanding lock request, so a cancel convert is
* required. We intentionally overwrite 'ret' - if the
* cancel fails and the lock was granted, it's easier
- * to just bubble sucess back up to the user.
+ * to just bubble success back up to the user.
*/
ret = ocfs2_flock_handle_signal(lockres, level);
} else if (!ret && (level > lockres->l_level)) {
@@ -1829,8 +1982,6 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
{
int kick = 0;
- mlog_entry_void();
-
/* If we know that another node is waiting on our lock, kick
* the downconvert thread * pre-emptively when we reach a release
* condition. */
@@ -1851,8 +2002,6 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
if (kick)
ocfs2_wake_downconvert_thread(osb);
-
- mlog_exit_void();
}
#define OCFS2_SEC_BITS 34
@@ -1881,8 +2030,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- mlog_entry_void();
-
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
/*
@@ -1898,8 +2045,8 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_version = OCFS2_LVB_VERSION;
lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
- lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
- lvb->lvb_igid = cpu_to_be32(inode->i_gid);
+ lvb->lvb_iuid = cpu_to_be32(i_uid_read(inode));
+ lvb->lvb_igid = cpu_to_be32(i_gid_read(inode));
lvb->lvb_imode = cpu_to_be16(inode->i_mode);
lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
lvb->lvb_iatime_packed =
@@ -1914,8 +2061,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
out:
mlog_meta_lvb(0, lockres);
-
- mlog_exit_void();
}
static void ocfs2_unpack_timespec(struct timespec *spec,
@@ -1931,8 +2076,6 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- mlog_entry_void();
-
mlog_meta_lvb(0, lockres);
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -1952,10 +2095,10 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
else
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
- inode->i_gid = be32_to_cpu(lvb->lvb_igid);
+ i_uid_write(inode, be32_to_cpu(lvb->lvb_iuid));
+ i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
- inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
+ set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
ocfs2_unpack_timespec(&inode->i_atime,
be64_to_cpu(lvb->lvb_iatime_packed));
ocfs2_unpack_timespec(&inode->i_mtime,
@@ -1963,8 +2106,6 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
ocfs2_unpack_timespec(&inode->i_ctime,
be64_to_cpu(lvb->lvb_ictime_packed));
spin_unlock(&oi->ip_lock);
-
- mlog_exit_void();
}
static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
@@ -1972,7 +2113,8 @@ static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
{
struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
- if (lvb->lvb_version == OCFS2_LVB_VERSION
+ if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)
+ && lvb->lvb_version == OCFS2_LVB_VERSION
&& be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
return 1;
return 0;
@@ -1990,8 +2132,6 @@ static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
unsigned long flags;
int status = 0;
- mlog_entry_void();
-
refresh_check:
spin_lock_irqsave(&lockres->l_lock, flags);
if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
@@ -2012,7 +2152,7 @@ refresh_check:
status = 1;
bail:
- mlog_exit(status);
+ mlog(0, "status %d\n", status);
return status;
}
@@ -2022,7 +2162,6 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
int status)
{
unsigned long flags;
- mlog_entry_void();
spin_lock_irqsave(&lockres->l_lock, flags);
lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
@@ -2031,8 +2170,6 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
spin_unlock_irqrestore(&lockres->l_lock, flags);
wake_up(&lockres->l_event);
-
- mlog_exit_void();
}
/* may or may not return a bh if it went to disk. */
@@ -2045,8 +2182,6 @@ static int ocfs2_inode_lock_update(struct inode *inode,
struct ocfs2_dinode *fe;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry_void();
-
if (ocfs2_mount_local(osb))
goto bail;
@@ -2066,7 +2201,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
/* This will discard any caching information we might have had
* for the inode metadata. */
- ocfs2_metadata_cache_purge(inode);
+ ocfs2_metadata_cache_purge(INODE_CACHE(inode));
ocfs2_extent_map_trunc(inode, 0);
@@ -2115,7 +2250,6 @@ static int ocfs2_inode_lock_update(struct inode *inode,
bail_refresh:
ocfs2_complete_lock_res_refresh(lockres, status);
bail:
- mlog_exit(status);
return status;
}
@@ -2145,10 +2279,11 @@ static int ocfs2_assign_bh(struct inode *inode,
* returns < 0 error if the callback will never be called, otherwise
* the result of the lock will be communicated via the callback.
*/
-int ocfs2_inode_lock_full(struct inode *inode,
- struct buffer_head **ret_bh,
- int ex,
- int arg_flags)
+int ocfs2_inode_lock_full_nested(struct inode *inode,
+ struct buffer_head **ret_bh,
+ int ex,
+ int arg_flags,
+ int subclass)
{
int status, level, acquired;
u32 dlm_flags;
@@ -2158,8 +2293,6 @@ int ocfs2_inode_lock_full(struct inode *inode,
BUG_ON(!inode);
- mlog_entry_void();
-
mlog(0, "inode %llu, take %s META lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ex ? "EXMODE" : "PRMODE");
@@ -2171,7 +2304,7 @@ int ocfs2_inode_lock_full(struct inode *inode,
if (ocfs2_is_hard_readonly(osb)) {
if (ex)
status = -EROFS;
- goto bail;
+ goto getbh;
}
if (ocfs2_mount_local(osb))
@@ -2186,9 +2319,10 @@ int ocfs2_inode_lock_full(struct inode *inode,
if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
dlm_flags |= DLM_LKF_NOQUEUE;
- status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
+ status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
+ arg_flags, subclass, _RET_IP_);
if (status < 0) {
- if (status != -EAGAIN && status != -EIOCBRETRY)
+ if (status != -EAGAIN)
mlog_errno(status);
goto bail;
}
@@ -2228,7 +2362,7 @@ local:
mlog_errno(status);
goto bail;
}
-
+getbh:
if (ret_bh) {
status = ocfs2_assign_bh(inode, ret_bh, local_bh);
if (status < 0) {
@@ -2250,7 +2384,6 @@ bail:
if (local_bh)
brelse(local_bh);
- mlog_exit(status);
return status;
}
@@ -2300,7 +2433,6 @@ int ocfs2_inode_lock_atime(struct inode *inode,
{
int ret;
- mlog_entry_void();
ret = ocfs2_inode_lock(inode, NULL, 0);
if (ret < 0) {
mlog_errno(ret);
@@ -2328,7 +2460,6 @@ int ocfs2_inode_lock_atime(struct inode *inode,
} else
*level = 0;
- mlog_exit(ret);
return ret;
}
@@ -2339,8 +2470,6 @@ void ocfs2_inode_unlock(struct inode *inode,
struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry_void();
-
mlog(0, "inode %llu drop %s META lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ex ? "EXMODE" : "PRMODE");
@@ -2348,8 +2477,47 @@ void ocfs2_inode_unlock(struct inode *inode,
if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
!ocfs2_mount_local(osb))
ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
+}
- mlog_exit_void();
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
+{
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_orphan_scan_lvb *lvb;
+ int status = 0;
+
+ if (ocfs2_is_hard_readonly(osb))
+ return -EROFS;
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ lockres = &osb->osb_orphan_scan.os_lockres;
+ status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
+ if (status < 0)
+ return status;
+
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
+ lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
+ *seqno = be32_to_cpu(lvb->lvb_os_seqno);
+ else
+ *seqno = osb->osb_orphan_scan.os_seqno + 1;
+
+ return status;
+}
+
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno)
+{
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_orphan_scan_lvb *lvb;
+
+ if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) {
+ lockres = &osb->osb_orphan_scan.os_lockres;
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
+ lvb->lvb_os_seqno = cpu_to_be32(seqno);
+ ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
+ }
}
int ocfs2_super_lock(struct ocfs2_super *osb,
@@ -2359,8 +2527,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
- mlog_entry_void();
-
if (ocfs2_is_hard_readonly(osb))
return -EROFS;
@@ -2378,21 +2544,18 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
* refreshed, so we do it here. Of course, making sense of
* everything is up to the caller :) */
status = ocfs2_should_refresh_lock_res(lockres);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
if (status) {
status = ocfs2_refresh_slot_info(osb);
ocfs2_complete_lock_res_refresh(lockres, status);
- if (status < 0)
+ if (status < 0) {
+ ocfs2_cluster_unlock(osb, lockres, level);
mlog_errno(status);
+ }
ocfs2_track_lock_refresh(lockres);
}
bail:
- mlog_exit(status);
return status;
}
@@ -2469,8 +2632,11 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
BUG_ON(!dl);
- if (ocfs2_is_hard_readonly(osb))
- return -EROFS;
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (ex)
+ return -EROFS;
+ return 0;
+ }
if (ocfs2_mount_local(osb))
return 0;
@@ -2488,7 +2654,7 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- if (!ocfs2_mount_local(osb))
+ if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
}
@@ -2611,8 +2777,15 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
return iter;
}
-/* So that debugfs.ocfs2 can determine which format is being used */
-#define OCFS2_DLM_DEBUG_STR_VERSION 2
+/*
+ * Version is used by debugfs.ocfs2 to determine the format being used
+ *
+ * New in version 2
+ * - Lock stats printed
+ * New in version 3
+ * - Max time in lock stats is in usecs (instead of nsecs)
+ */
+#define OCFS2_DLM_DEBUG_STR_VERSION 3
static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
{
int i;
@@ -2654,18 +2827,18 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
seq_printf(m, "0x%x\t", lvb[i]);
#ifdef CONFIG_OCFS2_FS_STATS
-# define lock_num_prmode(_l) (_l)->l_lock_num_prmode
-# define lock_num_exmode(_l) (_l)->l_lock_num_exmode
-# define lock_num_prmode_failed(_l) (_l)->l_lock_num_prmode_failed
-# define lock_num_exmode_failed(_l) (_l)->l_lock_num_exmode_failed
-# define lock_total_prmode(_l) (_l)->l_lock_total_prmode
-# define lock_total_exmode(_l) (_l)->l_lock_total_exmode
-# define lock_max_prmode(_l) (_l)->l_lock_max_prmode
-# define lock_max_exmode(_l) (_l)->l_lock_max_exmode
-# define lock_refresh(_l) (_l)->l_lock_refresh
+# define lock_num_prmode(_l) ((_l)->l_lock_prmode.ls_gets)
+# define lock_num_exmode(_l) ((_l)->l_lock_exmode.ls_gets)
+# define lock_num_prmode_failed(_l) ((_l)->l_lock_prmode.ls_fail)
+# define lock_num_exmode_failed(_l) ((_l)->l_lock_exmode.ls_fail)
+# define lock_total_prmode(_l) ((_l)->l_lock_prmode.ls_total)
+# define lock_total_exmode(_l) ((_l)->l_lock_exmode.ls_total)
+# define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max)
+# define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max)
+# define lock_refresh(_l) ((_l)->l_lock_refresh)
#else
-# define lock_num_prmode(_l) (0ULL)
-# define lock_num_exmode(_l) (0ULL)
+# define lock_num_prmode(_l) (0)
+# define lock_num_exmode(_l) (0)
# define lock_num_prmode_failed(_l) (0)
# define lock_num_exmode_failed(_l) (0)
# define lock_total_prmode(_l) (0ULL)
@@ -2675,8 +2848,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
# define lock_refresh(_l) (0)
#endif
/* The following seq_print was added in version 2 of this output */
- seq_printf(m, "%llu\t"
- "%llu\t"
+ seq_printf(m, "%u\t"
+ "%u\t"
"%u\t"
"%u\t"
"%llu\t"
@@ -2708,7 +2881,7 @@ static const struct seq_operations ocfs2_dlm_seq_ops = {
static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
{
- struct seq_file *seq = (struct seq_file *) file->private_data;
+ struct seq_file *seq = file->private_data;
struct ocfs2_dlm_seq_priv *priv = seq->private;
struct ocfs2_lock_res *res = &priv->p_iter_res;
@@ -2742,7 +2915,7 @@ static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
goto out;
}
- seq = (struct seq_file *) file->private_data;
+ seq = file->private_data;
seq->private = priv;
ocfs2_add_lockres_tracking(&priv->p_iter_res,
@@ -2796,8 +2969,6 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
int status = 0;
struct ocfs2_cluster_connection *conn = NULL;
- mlog_entry_void();
-
if (ocfs2_mount_local(osb)) {
osb->node_num = 0;
goto local;
@@ -2820,16 +2991,18 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
/* for now, uuid == domain */
status = ocfs2_cluster_connect(osb->osb_cluster_stack,
+ osb->osb_cluster_name,
+ strlen(osb->osb_cluster_name),
osb->uuid_str,
strlen(osb->uuid_str),
- ocfs2_do_node_down, osb,
+ &lproto, ocfs2_do_node_down, osb,
&conn);
if (status) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_cluster_this_node(&osb->node_num);
+ status = ocfs2_cluster_this_node(conn, &osb->node_num);
if (status < 0) {
mlog_errno(status);
mlog(ML_ERROR,
@@ -2842,6 +3015,7 @@ local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
osb->cconn = conn;
@@ -2853,15 +3027,12 @@ bail:
kthread_stop(osb->dc_task);
}
- mlog_exit(status);
return status;
}
void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
int hangup_pending)
{
- mlog_entry_void();
-
ocfs2_drop_osb_locks(osb);
/*
@@ -2878,56 +3049,12 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
ocfs2_lock_res_free(&osb->osb_super_lockres);
ocfs2_lock_res_free(&osb->osb_rename_lockres);
ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
+ ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
osb->cconn = NULL;
ocfs2_dlm_shutdown_debug(osb);
-
- mlog_exit_void();
-}
-
-static void ocfs2_unlock_ast(void *opaque, int error)
-{
- struct ocfs2_lock_res *lockres = opaque;
- unsigned long flags;
-
- mlog_entry_void();
-
- mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
- lockres->l_unlock_action);
-
- spin_lock_irqsave(&lockres->l_lock, flags);
- if (error) {
- mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
- "unlock_action %d\n", error, lockres->l_name,
- lockres->l_unlock_action);
- spin_unlock_irqrestore(&lockres->l_lock, flags);
- return;
- }
-
- switch(lockres->l_unlock_action) {
- case OCFS2_UNLOCK_CANCEL_CONVERT:
- mlog(0, "Cancel convert success for %s\n", lockres->l_name);
- lockres->l_action = OCFS2_AST_INVALID;
- /* Downconvert thread may have requeued this lock, we
- * need to wake it. */
- if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
- ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
- break;
- case OCFS2_UNLOCK_DROP_LOCK:
- lockres->l_level = DLM_LOCK_IV;
- break;
- default:
- BUG();
- }
-
- lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
- lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
- wake_up(&lockres->l_event);
- spin_unlock_irqrestore(&lockres->l_lock, flags);
-
- mlog_exit_void();
}
static int ocfs2_drop_lock(struct ocfs2_super *osb,
@@ -2997,8 +3124,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
mlog(0, "lock %s\n", lockres->l_name);
- ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags,
- lockres);
+ ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
if (ret) {
ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
@@ -3010,26 +3136,63 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
ocfs2_wait_on_busy_lock(lockres);
out:
- mlog_exit(0);
return 0;
}
+static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
+
/* Mark the lockres as being dropped. It will no longer be
* queued if blocking, but we still may have to wait on it
* being dequeued from the downconvert thread before we can consider
- * it safe to drop.
+ * it safe to drop.
*
* You can *not* attempt to call cluster_lock on this lockres anymore. */
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres)
{
int status;
struct ocfs2_mask_waiter mw;
- unsigned long flags;
+ unsigned long flags, flags2;
ocfs2_init_mask_waiter(&mw);
spin_lock_irqsave(&lockres->l_lock, flags);
lockres->l_flags |= OCFS2_LOCK_FREEING;
+ if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
+ /*
+ * We know the downconvert is queued but not in progress
+ * because we are the downconvert thread and processing
+ * different lock. So we can just remove the lock from the
+ * queue. This is not only an optimization but also a way
+ * to avoid the following deadlock:
+ * ocfs2_dentry_post_unlock()
+ * ocfs2_dentry_lock_put()
+ * ocfs2_drop_dentry_lock()
+ * iput()
+ * ocfs2_evict_inode()
+ * ocfs2_clear_inode()
+ * ocfs2_mark_lockres_freeing()
+ * ... blocks waiting for OCFS2_LOCK_QUEUED
+ * since we are the downconvert thread which
+ * should clear the flag.
+ */
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ spin_lock_irqsave(&osb->dc_task_lock, flags2);
+ list_del_init(&lockres->l_blocked_list);
+ osb->blocked_lock_count--;
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags2);
+ /*
+ * Warn if we recurse into another post_unlock call. Strictly
+ * speaking it isn't a problem but we need to be careful if
+ * that happens (stack overflow, deadlocks, ...) so warn if
+ * ocfs2 grows a path for which this can happen.
+ */
+ WARN_ON_ONCE(lockres->l_ops->post_unlock);
+ /* Since the lock is freeing we don't do much in the fn below */
+ ocfs2_process_blocked_lock(osb, lockres);
+ return;
+ }
while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3050,7 +3213,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
{
int ret;
- ocfs2_mark_lockres_freeing(lockres);
+ ocfs2_mark_lockres_freeing(osb, lockres);
ret = ocfs2_drop_lock(osb, lockres);
if (ret)
mlog_errno(ret);
@@ -3061,14 +3224,13 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
+ ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
}
int ocfs2_drop_inode_locks(struct inode *inode)
{
int status, err;
- mlog_entry_void();
-
/* No need to call ocfs2_mark_lockres_freeing here -
* ocfs2_clear_inode has done it for us. */
@@ -3093,7 +3255,6 @@ int ocfs2_drop_inode_locks(struct inode *inode)
if (err < 0 && !status)
status = err;
- mlog_exit(status);
return status;
}
@@ -3105,13 +3266,20 @@ static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
if (lockres->l_level <= new_level) {
- mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n",
- lockres->l_level, new_level);
+ mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
+ "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
+ "block %d, pgen %d\n", lockres->l_name, lockres->l_level,
+ new_level, list_empty(&lockres->l_blocked_list),
+ list_empty(&lockres->l_mask_waiters), lockres->l_type,
+ lockres->l_flags, lockres->l_ro_holders,
+ lockres->l_ex_holders, lockres->l_action,
+ lockres->l_unlock_action, lockres->l_requested,
+ lockres->l_blocking, lockres->l_pending_gen);
BUG();
}
- mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
- lockres->l_name, new_level, lockres->l_blocking);
+ mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
+ lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);
lockres->l_action = OCFS2_AST_DOWNCONVERT;
lockres->l_requested = new_level;
@@ -3128,7 +3296,8 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
int ret;
u32 dlm_flags = DLM_LKF_CONVERT;
- mlog_entry_void();
+ mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
+ lockres->l_level, new_level);
if (lvb)
dlm_flags |= DLM_LKF_VALBLK;
@@ -3138,8 +3307,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
&lockres->l_lksb,
dlm_flags,
lockres->l_name,
- OCFS2_LOCK_ID_MAX_LEN - 1,
- lockres);
+ OCFS2_LOCK_ID_MAX_LEN - 1);
lockres_clear_pending(lockres, generation, osb);
if (ret) {
ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -3149,7 +3317,6 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
ret = 0;
bail:
- mlog_exit(ret);
return ret;
}
@@ -3159,15 +3326,11 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
{
assert_spin_locked(&lockres->l_lock);
- mlog_entry_void();
- mlog(0, "lock %s\n", lockres->l_name);
-
if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
/* If we're already trying to cancel a lock conversion
* then just drop the spinlock and allow the caller to
* requeue this lock. */
-
- mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
+ mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
return 0;
}
@@ -3182,6 +3345,8 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
"lock %s, invalid flags: 0x%lx\n",
lockres->l_name, lockres->l_flags);
+ mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
+
return 1;
}
@@ -3190,19 +3355,15 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
{
int ret;
- mlog_entry_void();
- mlog(0, "lock %s\n", lockres->l_name);
-
ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
- DLM_LKF_CANCEL, lockres);
+ DLM_LKF_CANCEL);
if (ret) {
ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
ocfs2_recover_from_dlm_error(lockres, 0);
}
- mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name);
+ mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
- mlog_exit(ret);
return ret;
}
@@ -3213,17 +3374,24 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
unsigned long flags;
int blocking;
int new_level;
+ int level;
int ret = 0;
int set_lvb = 0;
unsigned int gen;
- mlog_entry_void();
-
spin_lock_irqsave(&lockres->l_lock, flags);
- BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
-
recheck:
+ /*
+ * Is it still blocking? If not, we have no more work to do.
+ */
+ if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
+ BUG_ON(lockres->l_blocking != DLM_LOCK_NL);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ ret = 0;
+ goto leave;
+ }
+
if (lockres->l_flags & OCFS2_LOCK_BUSY) {
/* XXX
* This is a *big* race. The OCFS2_LOCK_PENDING flag
@@ -3248,8 +3416,11 @@ recheck:
* at the same time they set OCFS2_DLM_BUSY. They must
* clear OCFS2_DLM_PENDING after dlm_lock() returns.
*/
- if (lockres->l_flags & OCFS2_LOCK_PENDING)
+ if (lockres->l_flags & OCFS2_LOCK_PENDING) {
+ mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
+ lockres->l_name);
goto leave_requeue;
+ }
ctl->requeue = 1;
ret = ocfs2_prepare_cancel_convert(osb, lockres);
@@ -3262,31 +3433,70 @@ recheck:
goto leave;
}
+ /*
+ * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is
+ * set when the ast is received for an upconvert just before the
+ * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast
+ * on the heels of the ast, we want to delay the downconvert just
+ * enough to allow the up requestor to do its task. Because this
+ * lock is in the blocked queue, the lock will be downconverted
+ * as soon as the requestor is done with the lock.
+ */
+ if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING)
+ goto leave_requeue;
+
+ /*
+ * How can we block and yet be at NL? We were trying to upconvert
+ * from NL and got canceled. The code comes back here, and now
+ * we notice and clear BLOCKING.
+ */
+ if (lockres->l_level == DLM_LOCK_NL) {
+ BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
+ mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
+ lockres->l_blocking = DLM_LOCK_NL;
+ lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ goto leave;
+ }
+
/* if we're blocking an exclusive and we have *any* holders,
* then requeue. */
if ((lockres->l_blocking == DLM_LOCK_EX)
- && (lockres->l_ex_holders || lockres->l_ro_holders))
+ && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+ mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
+ lockres->l_name, lockres->l_ex_holders,
+ lockres->l_ro_holders);
goto leave_requeue;
+ }
/* If it's a PR we're blocking, then only
* requeue if we've got any EX holders */
if (lockres->l_blocking == DLM_LOCK_PR &&
- lockres->l_ex_holders)
+ lockres->l_ex_holders) {
+ mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
+ lockres->l_name, lockres->l_ex_holders);
goto leave_requeue;
+ }
/*
* Can we get a lock in this state if the holder counts are
* zero? The meta data unblock code used to check this.
*/
if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
- && (lockres->l_flags & OCFS2_LOCK_REFRESHING))
+ && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
+ mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
+ lockres->l_name);
goto leave_requeue;
+ }
new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
if (lockres->l_ops->check_downconvert
- && !lockres->l_ops->check_downconvert(lockres, new_level))
+ && !lockres->l_ops->check_downconvert(lockres, new_level)) {
+ mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
+ lockres->l_name);
goto leave_requeue;
+ }
/* If we get here, then we know that there are no more
* incompatible holders (and anyone asking for an incompatible
@@ -3299,17 +3509,24 @@ recheck:
* may sleep, so we save off a copy of what we're blocking as
* it may change while we're not holding the spin lock. */
blocking = lockres->l_blocking;
+ level = lockres->l_level;
spin_unlock_irqrestore(&lockres->l_lock, flags);
ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
- if (ctl->unblock_action == UNBLOCK_STOP_POST)
+ if (ctl->unblock_action == UNBLOCK_STOP_POST) {
+ mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
+ lockres->l_name);
goto leave;
+ }
spin_lock_irqsave(&lockres->l_lock, flags);
- if (blocking != lockres->l_blocking) {
+ if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
/* If this changed underneath us, then we can't drop
* it just yet. */
+ mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
+ "Recheck\n", lockres->l_name, blocking,
+ lockres->l_blocking, level, lockres->l_level);
goto recheck;
}
@@ -3336,14 +3553,14 @@ downconvert:
gen);
leave:
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
leave_requeue:
spin_unlock_irqrestore(&lockres->l_lock, flags);
ctl->requeue = 1;
- mlog_exit(0);
return 0;
}
@@ -3352,10 +3569,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
{
struct inode *inode;
struct address_space *mapping;
+ struct ocfs2_inode_info *oi;
inode = ocfs2_lock_res_inode(lockres);
mapping = inode->i_mapping;
+ if (S_ISDIR(inode->i_mode)) {
+ oi = OCFS2_I(inode);
+ oi->ip_dir_lock_gen++;
+ mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
+ goto out;
+ }
+
if (!S_ISREG(inode->i_mode))
goto out;
@@ -3388,11 +3613,11 @@ out:
return UNBLOCK_CONTINUE;
}
-static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
- int new_level)
+static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci,
+ struct ocfs2_lock_res *lockres,
+ int new_level)
{
- struct inode *inode = ocfs2_lock_res_inode(lockres);
- int checkpointed = ocfs2_inode_fully_checkpointed(inode);
+ int checkpointed = ocfs2_ci_fully_checkpointed(ci);
BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
@@ -3400,10 +3625,18 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
if (checkpointed)
return 1;
- ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb));
+ ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci)));
return 0;
}
+static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level)
+{
+ struct inode *inode = ocfs2_lock_res_inode(lockres);
+
+ return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level);
+}
+
static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
{
struct inode *inode = ocfs2_lock_res_inode(lockres);
@@ -3533,6 +3766,26 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
return UNBLOCK_CONTINUE_POST;
}
+static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
+ int new_level)
+{
+ struct ocfs2_refcount_tree *tree =
+ ocfs2_lock_res_refcount_tree(lockres);
+ /* Refcount-tree flavour of the downconvert check: pass the tree's embedded rf_ci to ocfs2_ci_checkpointed() and return its result. */
+ return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level);
+}
+
+static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
+ int blocking)
+{
+ struct ocfs2_refcount_tree *tree =
+ ocfs2_lock_res_refcount_tree(lockres);
+ /* Calls ocfs2_metadata_cache_purge() on the tree's rf_ci; the 'blocking' argument is not consulted. */
+ ocfs2_metadata_cache_purge(&tree->rf_ci);
+ /* Unconditionally reports UNBLOCK_CONTINUE. */
+ return UNBLOCK_CONTINUE;
+}
+
static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
{
struct ocfs2_qinfo_lvb *lvb;
@@ -3540,8 +3793,6 @@ static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
oinfo->dqi_gi.dqi_type);
- mlog_entry_void();
-
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
@@ -3550,8 +3801,6 @@ static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
-
- mlog_exit_void();
}
void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
@@ -3560,10 +3809,8 @@ void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
- mlog_entry_void();
if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
ocfs2_cluster_unlock(osb, lockres, level);
- mlog_exit_void();
}
static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
@@ -3576,7 +3823,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
struct ocfs2_global_disk_dqinfo *gdinfo;
int status = 0;
- if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
+ if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
+ lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
@@ -3585,7 +3833,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
oinfo->dqi_gi.dqi_free_entry =
be32_to_cpu(lvb->lvb_free_entry);
} else {
- status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
+ status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
+ oinfo->dqi_giblk, &bh);
if (status) {
mlog_errno(status);
goto bail;
@@ -3616,8 +3865,6 @@ int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
int status = 0;
- mlog_entry_void();
-
/* On RO devices, locking really isn't needed... */
if (ocfs2_is_hard_readonly(osb)) {
if (ex)
@@ -3640,48 +3887,39 @@ int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
ocfs2_qinfo_unlock(oinfo, ex);
ocfs2_complete_lock_res_refresh(lockres, status);
bail:
- mlog_exit(status);
return status;
}
-/*
- * This is the filesystem locking protocol. It provides the lock handling
- * hooks for the underlying DLM. It has a maximum version number.
- * The version number allows interoperability with systems running at
- * the same major number and an equal or smaller minor number.
- *
- * Whenever the filesystem does new things with locks (adds or removes a
- * lock, orders them differently, does different things underneath a lock),
- * the version must be changed. The protocol is negotiated when joining
- * the dlm domain. A node may join the domain if its major version is
- * identical to all other nodes and its minor version is greater than
- * or equal to all other nodes. When its minor version is greater than
- * the other nodes, it will run at the minor version specified by the
- * other nodes.
- *
- * If a locking change is made that will not be compatible with older
- * versions, the major number must be increased and the minor version set
- * to zero. If a change merely adds a behavior that can be disabled when
- * speaking to older versions, the minor version must be increased. If a
- * change adds a fully backwards compatible change (eg, LVB changes that
- * are just ignored by older versions), the version does not need to be
- * updated.
- */
-static struct ocfs2_locking_protocol lproto = {
- .lp_max_version = {
- .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
- .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
- },
- .lp_lock_ast = ocfs2_locking_ast,
- .lp_blocking_ast = ocfs2_blocking_ast,
- .lp_unlock_ast = ocfs2_unlock_ast,
-};
-
-void ocfs2_set_locking_protocol(void)
+int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex)
{
- ocfs2_stack_glue_set_locking_protocol(&lproto);
+ int status;
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; /* non-zero 'ex' requests EX, zero requests PR */
+ struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
+ struct ocfs2_super *osb = lockres->l_priv;
+ /* Two early exits skip the cluster lock entirely: hard read-only returns -EROFS, a local mount returns 0. */
+
+ if (ocfs2_is_hard_readonly(osb))
+ return -EROFS;
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+ /* Otherwise take the cluster lock on the tree's rf_lockres; a negative status is logged and returned to the caller. */
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+ if (status < 0)
+ mlog_errno(status);
+
+ return status;
}
+void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
+{
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; /* non-zero 'ex' releases EX, zero releases PR */
+ struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
+ struct ocfs2_super *osb = lockres->l_priv;
+ /* Local mounts skip the cluster unlock; otherwise release rf_lockres at the level selected by 'ex'. */
+ if (!ocfs2_mount_local(osb))
+ ocfs2_cluster_unlock(osb, lockres, level);
+}
static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
@@ -3694,12 +3932,10 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
* considered valid until we remove the OCFS2_LOCK_QUEUED
* flag. */
- mlog_entry_void();
-
BUG_ON(!lockres);
BUG_ON(!lockres->l_ops);
- mlog(0, "lockres %s blocked.\n", lockres->l_name);
+ mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);
/* Detect whether a lock has been marked as going away while
* the downconvert thread was processing other things. A lock can
@@ -3722,21 +3958,19 @@ unqueue:
} else
ocfs2_schedule_blocked_lock(osb, lockres);
- mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
+ mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
ctl.requeue ? "yes" : "no");
spin_unlock_irqrestore(&lockres->l_lock, flags);
if (ctl.unblock_action != UNBLOCK_CONTINUE
&& lockres->l_ops->post_unlock)
lockres->l_ops->post_unlock(osb, lockres);
-
- mlog_exit_void();
}
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
{
- mlog_entry_void();
+ unsigned long flags;
assert_spin_locked(&lockres->l_lock);
@@ -3744,32 +3978,29 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
/* Do not schedule a lock for downconvert when it's on
* the way to destruction - any nodes wanting access
* to the resource will get it soon. */
- mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
+ mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
lockres->l_name, lockres->l_flags);
return;
}
lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
if (list_empty(&lockres->l_blocked_list)) {
list_add_tail(&lockres->l_blocked_list,
&osb->blocked_lock_list);
osb->blocked_lock_count++;
}
- spin_unlock(&osb->dc_task_lock);
-
- mlog_exit_void();
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
}
static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
{
unsigned long processed;
+ unsigned long flags;
struct ocfs2_lock_res *lockres;
- mlog_entry_void();
-
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
/* grab this early so we know to try again if a state change and
* wake happens part-way through our work */
osb->dc_work_sequence = osb->dc_wake_sequence;
@@ -3782,40 +4013,40 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
struct ocfs2_lock_res, l_blocked_list);
list_del_init(&lockres->l_blocked_list);
osb->blocked_lock_count--;
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
BUG_ON(!processed);
processed--;
ocfs2_process_blocked_lock(osb, lockres);
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
}
- spin_unlock(&osb->dc_task_lock);
-
- mlog_exit_void();
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
}
static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
{
int empty = 0;
+ unsigned long flags; /* interrupt state saved by spin_lock_irqsave() below */
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags); /* blocked_lock_list is inspected only while dc_task_lock is held */
if (list_empty(&osb->blocked_lock_list))
empty = 1;
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags); /* result is 1 when the list was empty, 0 otherwise */
return empty;
}
static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
{
int should_wake = 0;
+ unsigned long flags; /* interrupt state saved by spin_lock_irqsave() below */
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags); /* the two sequence counters are compared under dc_task_lock */
if (osb->dc_work_sequence != osb->dc_wake_sequence)
should_wake = 1;
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags); /* result is 1 when dc_work_sequence differs from dc_wake_sequence */
return should_wake;
}
@@ -3845,10 +4076,12 @@ static int ocfs2_downconvert_thread(void *arg)
void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
{
- spin_lock(&osb->dc_task_lock);
+ unsigned long flags;
+ /* Bump dc_wake_sequence under dc_task_lock (taken with interrupts saved/disabled), then wake anyone waiting on dc_event. */
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
/* make sure the voting thread gets a swipe at whatever changes
* the caller may have made to the voting state */
osb->dc_wake_sequence++;
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
wake_up(&osb->dc_event);
}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e1fd5721cd7..d293a22c32c 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb {
__be32 lvb_free_entry;
};
+#define OCFS2_ORPHAN_LVB_VERSION 1
+ /* Orphan-scan LVB layout: one version byte (presumably checked against OCFS2_ORPHAN_LVB_VERSION -- confirm in dlmglue.c), three reserved bytes, and a big-endian 32-bit sequence number. */
+struct ocfs2_orphan_scan_lvb {
+ __u8 lvb_version;
+ __u8 lvb_reserved[3];
+ __be32 lvb_os_seqno;
+};
+
/* ocfs2_inode_lock_full() 'arg_flags' flags */
/* don't wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -70,6 +78,15 @@ struct ocfs2_qinfo_lvb {
/* don't block waiting for the downconvert thread, instead return -EAGAIN */
#define OCFS2_LOCK_NONBLOCK (0x04)
+/* Locking subclasses of inode cluster lock; passed as the 'subclass' argument of ocfs2_inode_lock_full_nested(). */
+enum {
+ OI_LS_NORMAL = 0, /* subclass supplied by the ocfs2_inode_lock() and ocfs2_inode_lock_full() macros */
+ OI_LS_PARENT,
+ OI_LS_RENAME1,
+ OI_LS_RENAME2,
+ OI_LS_REFLINK_TARGET,
+};
+
int ocfs2_dlm_init(struct ocfs2_super *osb);
void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
@@ -85,6 +102,9 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_mem_dqinfo;
void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_mem_dqinfo *info);
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_super *osb, u64 ref_blkno,
+ unsigned int generation);
void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
int ocfs2_create_new_inode_locks(struct inode *inode);
int ocfs2_drop_inode_locks(struct inode *inode);
@@ -96,23 +116,32 @@ void ocfs2_open_unlock(struct inode *inode);
int ocfs2_inode_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
int *level);
-int ocfs2_inode_lock_full(struct inode *inode,
+int ocfs2_inode_lock_full_nested(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
- int arg_flags);
+ int arg_flags,
+ int subclass);
int ocfs2_inode_lock_with_page(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
struct page *page);
+/* Variants without special locking class or flags */
+#define ocfs2_inode_lock_full(i, r, e, f)\
+ ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL)
+#define ocfs2_inode_lock_nested(i, b, e, s)\
+ ocfs2_inode_lock_full_nested(i, b, e, 0, s)
/* 99% of the time we don't want to supply any additional flags --
* those are for very specific cases only. */
-#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full(i, b, e, 0)
+#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL)
void ocfs2_inode_unlock(struct inode *inode,
int ex);
int ocfs2_super_lock(struct ocfs2_super *osb,
int ex);
void ocfs2_super_unlock(struct ocfs2_super *osb,
int ex);
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno);
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno);
+
int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
@@ -123,9 +152,13 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock);
void ocfs2_file_unlock(struct file *file);
int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+struct ocfs2_refcount_tree;
+int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
+void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 15713cbb865..29651167190 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -26,7 +26,6 @@
#include <linux/fs.h>
#include <linux/types.h>
-#define MLOG_MASK_PREFIX ML_EXPORT
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -40,6 +39,7 @@
#include "buffer_head_io.h"
#include "suballoc.h"
+#include "ocfs2_trace.h"
struct ocfs2_inode_handle
{
@@ -56,10 +56,9 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
int status, set;
struct dentry *result;
- mlog_entry("(0x%p, 0x%p)\n", sb, handle);
+ trace_ocfs2_get_dentry_begin(sb, handle, (unsigned long long)blkno);
if (blkno == 0) {
- mlog(0, "nfs wants inode with blkno: 0\n");
result = ERR_PTR(-ESTALE);
goto bail;
}
@@ -83,6 +82,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
}
status = ocfs2_test_inode_bit(osb, blkno, &set);
+ trace_ocfs2_get_dentry_test_bit(status, set);
if (status < 0) {
if (status == -EINVAL) {
/*
@@ -90,18 +90,14 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
* as an inode, we return -ESTALE to be
* nice
*/
- mlog(0, "test inode bit failed %d\n", status);
status = -ESTALE;
- } else {
+ } else
mlog(ML_ERROR, "test inode bit failed %d\n", status);
- }
goto unlock_nfs_sync;
}
/* If the inode allocator bit is clear, this inode must be stale */
if (!set) {
- mlog(0, "inode %llu suballoc bit is clear\n",
- (unsigned long long)blkno);
status = -ESTALE;
goto unlock_nfs_sync;
}
@@ -114,8 +110,8 @@ unlock_nfs_sync:
check_err:
if (status < 0) {
if (status == -ESTALE) {
- mlog(0, "stale inode ino: %llu generation: %u\n",
- (unsigned long long)blkno, handle->ih_generation);
+ trace_ocfs2_get_dentry_stale((unsigned long long)blkno,
+ handle->ih_generation);
}
result = ERR_PTR(status);
goto bail;
@@ -130,20 +126,19 @@ check_err:
check_gen:
if (handle->ih_generation != inode->i_generation) {
iput(inode);
- mlog(0, "stale inode ino: %llu generation: %u\n",
- (unsigned long long)blkno, handle->ih_generation);
+ trace_ocfs2_get_dentry_generation((unsigned long long)blkno,
+ handle->ih_generation,
+ inode->i_generation);
result = ERR_PTR(-ESTALE);
goto bail;
}
result = d_obtain_alias(inode);
- if (!IS_ERR(result))
- result->d_op = &ocfs2_dentry_ops;
- else
+ if (IS_ERR(result))
mlog_errno(PTR_ERR(result));
bail:
- mlog_exit_ptr(result);
+ trace_ocfs2_get_dentry_end(result);
return result;
}
@@ -154,11 +149,8 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
struct dentry *parent;
struct inode *dir = child->d_inode;
- mlog_entry("(0x%p, '%.*s')\n", child,
- child->d_name.len, child->d_name.name);
-
- mlog(0, "find parent of directory %llu\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno);
+ trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno);
status = ocfs2_inode_lock(dir, NULL, 0);
if (status < 0) {
@@ -175,55 +167,53 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
}
parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
- if (!IS_ERR(parent))
- parent->d_op = &ocfs2_dentry_ops;
bail_unlock:
ocfs2_inode_unlock(dir, 0);
bail:
- mlog_exit_ptr(parent);
+ trace_ocfs2_get_parent_end(parent);
return parent;
}
-static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
- int connectable)
+static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len,
+ struct inode *parent)
{
- struct inode *inode = dentry->d_inode;
int len = *max_len;
int type = 1;
u64 blkno;
u32 generation;
__le32 *fh = (__force __le32 *) fh_in;
- mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
- dentry->d_name.len, dentry->d_name.name,
- fh, len, connectable);
+#ifdef TRACE_HOOKS_ARE_NOT_BRAINDEAD_IN_YOUR_OPINION
+#error "You go ahead and fix that mess, then. Somehow"
+ trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len,
+ dentry->d_name.name,
+ fh, len, connectable);
+#endif
- if (len < 3 || (connectable && len < 6)) {
- mlog(ML_ERROR, "fh buffer is too small for encoding\n");
- type = 255;
+ if (parent && (len < 6)) {
+ *max_len = 6;
+ type = FILEID_INVALID;
+ goto bail;
+ } else if (len < 3) {
+ *max_len = 3;
+ type = FILEID_INVALID;
goto bail;
}
blkno = OCFS2_I(inode)->ip_blkno;
generation = inode->i_generation;
- mlog(0, "Encoding fh: blkno: %llu, generation: %u\n",
- (unsigned long long)blkno, generation);
+ trace_ocfs2_encode_fh_self((unsigned long long)blkno, generation);
len = 3;
fh[0] = cpu_to_le32((u32)(blkno >> 32));
fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
fh[2] = cpu_to_le32(generation);
- if (connectable && !S_ISDIR(inode->i_mode)) {
- struct inode *parent;
-
- spin_lock(&dentry->d_lock);
-
- parent = dentry->d_parent->d_inode;
+ if (parent) {
blkno = OCFS2_I(parent)->ip_blkno;
generation = parent->i_generation;
@@ -231,19 +221,17 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
fh[5] = cpu_to_le32(generation);
- spin_unlock(&dentry->d_lock);
-
len = 6;
type = 2;
- mlog(0, "Encoding parent: blkno: %llu, generation: %u\n",
- (unsigned long long)blkno, generation);
+ trace_ocfs2_encode_fh_parent((unsigned long long)blkno,
+ generation);
}
-
+
*max_len = len;
bail:
- mlog_exit(type);
+ trace_ocfs2_encode_fh_type(type);
return type;
}
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index f2bb1a04d25..767370b656c 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,10 +24,10 @@
#include <linux/fs.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <linux/types.h>
#include <linux/fiemap.h>
-#define MLOG_MASK_PREFIX ML_EXTENT_MAP
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -37,6 +37,8 @@
#include "extent_map.h"
#include "inode.h"
#include "super.h"
+#include "symlink.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -191,7 +193,7 @@ static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
emi->ei_clusters += ins->ei_clusters;
return 1;
} else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
- (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys &&
+ (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
ins->ei_flags == emi->ei_flags) {
emi->ei_phys = ins->ei_phys;
emi->ei_cpos = ins->ei_cpos;
@@ -280,8 +282,7 @@ search:
spin_unlock(&oi->ip_lock);
out:
- if (new_emi)
- kfree(new_emi);
+ kfree(new_emi);
}
static int ocfs2_last_eb_is_empty(struct inode *inode,
@@ -293,7 +294,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
- ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
+ ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -353,11 +354,11 @@ static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
* eb_bh is NULL. Otherwise, eb_bh should point to the extent block
* containing el.
*/
-static int ocfs2_figure_hole_clusters(struct inode *inode,
- struct ocfs2_extent_list *el,
- struct buffer_head *eb_bh,
- u32 v_cluster,
- u32 *num_clusters)
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+ struct ocfs2_extent_list *el,
+ struct buffer_head *eb_bh,
+ u32 v_cluster,
+ u32 *num_clusters)
{
int ret, i;
struct buffer_head *next_eb_bh = NULL;
@@ -375,7 +376,7 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
goto no_more_extents;
- ret = ocfs2_read_extent_block(inode,
+ ret = ocfs2_read_extent_block(ci,
le64_to_cpu(eb->h_next_leaf_blk),
&next_eb_bh);
if (ret) {
@@ -428,7 +429,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
tree_height = le16_to_cpu(el->l_tree_depth);
if (tree_height > 0) {
- ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
+ &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -451,11 +453,12 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
if (i == -1) {
/*
* Holes can be larger than the maximum size of an
- * extent, so we return their lengths in a seperate
+ * extent, so we return their lengths in a separate
* field.
*/
if (hole_len) {
- ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
+ ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode),
+ el, eb_bh,
v_cluster, &len);
if (ret) {
mlog_errno(ret);
@@ -539,7 +542,8 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb,
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
- struct ocfs2_extent_list *el)
+ struct ocfs2_extent_list *el,
+ unsigned int *extent_flags)
{
int ret = 0, i;
struct buffer_head *eb_bh = NULL;
@@ -548,7 +552,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 coff;
if (el->l_tree_depth) {
- ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
+ &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -590,6 +595,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
*p_cluster = *p_cluster + coff;
if (num_clusters)
*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+
+ if (extent_flags)
+ *extent_flags = rec->e_flags;
}
out:
if (eb_bh)
@@ -696,6 +704,12 @@ out:
return ret;
}
+/*
+ * The name ocfs2_fiemap_inline() is slightly misleading: besides
+ * handling fiemap for inline-data files, it also handles fast
+ * symlinks, because the two do not differ as far as extent
+ * mapping is concerned.
+ */
static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
struct fiemap_extent_info *fieinfo,
u64 map_start)
@@ -708,11 +722,18 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
struct ocfs2_inode_info *oi = OCFS2_I(inode);
di = (struct ocfs2_dinode *)di_bh->b_data;
- id_count = le16_to_cpu(di->id2.i_data.id_count);
+ if (ocfs2_inode_is_fast_symlink(inode))
+ id_count = ocfs2_fast_symlink_chars(inode->i_sb);
+ else
+ id_count = le16_to_cpu(di->id2.i_data.id_count);
if (map_start < id_count) {
phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
- phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+ if (ocfs2_inode_is_fast_symlink(inode))
+ phys += offsetof(struct ocfs2_dinode, id2.i_symlink);
+ else
+ phys += offsetof(struct ocfs2_dinode,
+ id2.i_data.id_data);
ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
flags);
@@ -749,9 +770,10 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
down_read(&OCFS2_I(inode)->ip_alloc_sem);
/*
- * Handle inline-data separately.
+ * Handle inline-data and fast symlink separately.
*/
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
+ ocfs2_inode_is_fast_symlink(inode)) {
ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
goto out_unlock;
}
@@ -759,7 +781,6 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
cpos = map_start >> osb->s_clustersize_bits;
mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
map_start + map_len);
- mapping_end -= cpos;
is_last = 0;
while (cpos < mapping_end && !is_last) {
u32 fe_flags;
@@ -768,7 +789,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
&hole_size, &rec, &is_last);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_unlock;
}
if (rec.e_blkno == 0ULL) {
@@ -779,6 +800,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
fe_flags = 0;
if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
+ if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
+ fe_flags |= FIEMAP_EXTENT_SHARED;
if (is_last)
fe_flags |= FIEMAP_EXTENT_LAST;
len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
@@ -807,6 +830,100 @@ out:
return ret;
}
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret;
+ unsigned int is_last = 0, is_data = 0;
+ u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 cpos, cend, clen, hole_size;
+ u64 extoff, extlen;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_extent_rec rec;
+ /*
+ * Advance *offset to the next data (SEEK_DATA) or hole (SEEK_HOLE)
+ * at or after its current value; *offset is never moved backwards.
+ * Returns 0 on success, -ENXIO when *offset is at or past i_size or
+ * SEEK_DATA finds no data, or the error reported by
+ * ocfs2_inode_lock() / ocfs2_get_clusters_nocache().
+ * Any other 'whence' value trips the BUG_ON below.
+ */
+ BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE);
+ /* Take the inode cluster lock with ex == 0 and obtain the dinode buffer in di_bh. */
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ if (*offset >= i_size_read(inode)) {
+ ret = -ENXIO;
+ goto out_unlock;
+ }
+ /* Inline-data inodes: SEEK_HOLE reports i_size, SEEK_DATA leaves *offset as is; both return 0. */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ if (whence == SEEK_HOLE)
+ *offset = i_size_read(inode);
+ goto out_unlock;
+ }
+ /* Walk ranges starting at the cluster containing *offset, up to the cluster count covering i_size. */
+ clen = 0;
+ cpos = *offset >> cs_bits;
+ cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+
+ while (cpos < cend && !is_last) {
+ ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
+ &rec, &is_last);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ extoff = cpos;
+ extoff <<= cs_bits;
+ /* e_blkno == 0 marks a hole of hole_size clusters; otherwise count the clusters left in this extent, treating unwritten extents as non-data. */
+ if (rec.e_blkno == 0ULL) {
+ clen = hole_size;
+ is_data = 0;
+ } else {
+ clen = le16_to_cpu(rec.e_leaf_clusters) -
+ (cpos - le32_to_cpu(rec.e_cpos));
+ is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
+ }
+ /* Stop at the first range of the requested kind; *offset only moves forward to the range start. */
+ if ((!is_data && whence == SEEK_HOLE) ||
+ (is_data && whence == SEEK_DATA)) {
+ if (extoff > *offset)
+ *offset = extoff;
+ goto out_unlock;
+ }
+
+ if (!is_last)
+ cpos += clen;
+ }
+ /* SEEK_HOLE with no hole found in the loop: report (cpos + clen) in bytes, capped at i_size. */
+ if (whence == SEEK_HOLE) {
+ extoff = cpos;
+ extoff <<= cs_bits;
+ extlen = clen;
+ extlen <<= cs_bits;
+
+ if ((extoff + extlen) > i_size_read(inode))
+ extlen = i_size_read(inode) - extoff;
+ extoff += extlen;
+ if (extoff > *offset)
+ *offset = extoff;
+ goto out_unlock;
+ }
+ /* SEEK_DATA and the loop found no data range. */
+ ret = -ENXIO;
+
+out_unlock:
+ /* Release in order: dinode buffer, ip_alloc_sem, then the inode cluster lock. */
+ brelse(di_bh);
+
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ocfs2_inode_unlock(inode, 0);
+out:
+ return ret;
+}
+
int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
@@ -816,10 +933,9 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
u64 p_block, p_count;
int i, count, done = 0;
- mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
- "flags = %x, validate = %p)\n",
- inode, (unsigned long long)v_block, nr, bhs, flags,
- validate);
+ trace_ocfs2_read_virt_blocks(
+ inode, (unsigned long long)v_block, nr, bhs, flags,
+ validate);
if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
i_size_read(inode)) {
@@ -862,8 +978,8 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
}
- rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
- flags, validate);
+ rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count,
+ bhs + done, flags, validate);
if (rc) {
mlog_errno(rc);
break;
@@ -872,7 +988,6 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
}
out:
- mlog_exit(rc);
return rc;
}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index b7dd9731b46..67ea57d2fd5 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,14 +53,22 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 map_start, u64 map_len);
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
+
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
- struct ocfs2_extent_list *el);
+ struct ocfs2_extent_list *el,
+ unsigned int *extent_flags);
int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
struct buffer_head *bh));
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+ struct ocfs2_extent_list *el,
+ struct buffer_head *eb_bh,
+ u32 v_cluster,
+ u32 *num_clusters);
static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
struct buffer_head **bh,
int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c2a87c885b7..2930e231f3f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -36,8 +36,8 @@
#include <linux/writeback.h>
#include <linux/falloc.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -59,15 +59,11 @@
#include "xattr.h"
#include "acl.h"
#include "quota.h"
+#include "refcounttree.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
-static int ocfs2_sync_inode(struct inode *inode)
-{
- filemap_fdatawrite(inode->i_mapping);
- return sync_mapping_buffers(inode->i_mapping);
-}
-
static int ocfs2_init_file_private(struct inode *inode, struct file *file)
{
struct ocfs2_file_private *fp;
@@ -103,8 +99,13 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
int mode = file->f_flags;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
- file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
+ trace_ocfs2_file_open(inode, file, file->f_path.dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name, mode);
+
+ if (file->f_mode & FMODE_WRITE)
+ dquot_initialize(inode);
spin_lock(&oi->ip_lock);
@@ -136,7 +137,6 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
}
leave:
- mlog_exit(status);
return status;
}
@@ -144,19 +144,19 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
- file->f_path.dentry->d_name.len,
- file->f_path.dentry->d_name.name);
-
spin_lock(&oi->ip_lock);
if (!--oi->ip_open_count)
oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
+
+ trace_ocfs2_file_release(inode, file, file->f_path.dentry,
+ oi->ip_blkno,
+ file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name,
+ oi->ip_open_count);
spin_unlock(&oi->ip_lock);
ocfs2_free_file_private(inode, file);
- mlog_exit(0);
-
return 0;
}
@@ -171,27 +171,44 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
return 0;
}
-static int ocfs2_sync_file(struct file *file,
- struct dentry *dentry,
+static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
int datasync)
{
int err = 0;
- journal_t *journal;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ journal_t *journal = osb->journal->j_journal;
+ int ret;
+ tid_t commit_tid;
+ bool needs_barrier = false; /* set when this function must issue the block-device flush itself */
- mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
+ OCFS2_I(inode)->ip_blkno,
+ file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name,
+ (unsigned long long)datasync);
+ /* Hard or soft read-only mounts fail with -EROFS; this and the next early return hand the error back without the -EIO mapping at the end. */
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
- err = ocfs2_sync_inode(dentry->d_inode);
+ err = filemap_write_and_wait_range(inode->i_mapping, start, end); /* write back only the [start, end] range of the mapping */
if (err)
- goto bail;
+ return err;
- journal = osb->journal->j_journal;
- err = jbd2_journal_force_commit(journal);
+ commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid; /* datasync waits on i_datasync_tid, a full sync on i_sync_tid */
+ if (journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ needs_barrier = true; /* barriers are on, but jbd2_trans_will_send_data_barrier() returned false for this tid */
+ err = jbd2_complete_transaction(journal, commit_tid);
+ if (needs_barrier) {
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (!err)
+ err = ret; /* keep the first error: a flush failure only counts if the commit succeeded */
+ }
-bail:
- mlog_exit(err);
+ if (err)
+ mlog_errno(err);
return (err < 0) ? -EIO : 0;
}
@@ -247,8 +264,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
handle_t *handle;
struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
- mlog_entry_void();
-
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
@@ -256,7 +271,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
goto out;
}
- ret = ocfs2_journal_access_di(handle, inode, bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -271,15 +286,12 @@ int ocfs2_update_inode_atime(struct inode *inode,
inode->i_atime = CURRENT_TIME;
di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
-
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
+ ocfs2_journal_dirty(handle, bh);
out_commit:
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
- mlog_exit(ret);
return ret;
}
@@ -290,7 +302,6 @@ static int ocfs2_set_inode_size(handle_t *handle,
{
int status;
- mlog_entry_void();
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -302,7 +313,6 @@ static int ocfs2_set_inode_size(handle_t *handle,
}
bail:
- mlog_exit(status);
return status;
}
@@ -326,11 +336,45 @@ int ocfs2_simple_size_update(struct inode *inode,
if (ret < 0)
mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_commit_trans(osb, handle);
out:
return ret;
}
+/*
+ * CoW the single cluster that holds byte @offset when that cluster sits in
+ * an extent flagged OCFS2_EXT_REFCOUNTED.
+ *
+ * Nothing is done when @offset is cluster aligned or when the extent is not
+ * refcounted.  Returns 0 in those cases, otherwise whatever
+ * ocfs2_get_clusters() or ocfs2_refcount_cow() returned.
+ */
+static int ocfs2_cow_file_pos(struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      u64 offset)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned int extent_flags = 0;
+	unsigned int extent_len = 0;
+	u32 p_cluster;
+	u32 v_cluster = offset >> osb->s_clustersize_bits;
+	int rc;
+
+	/*
+	 * A cluster-aligned offset leaves no partial cluster for
+	 * ocfs2_zero_range_for_truncate to fill, so there is nothing
+	 * to CoW either.
+	 */
+	if (!(offset & (osb->s_clustersize - 1)))
+		return 0;
+
+	rc = ocfs2_get_clusters(inode, v_cluster, &p_cluster,
+				&extent_len, &extent_flags);
+	if (rc) {
+		mlog_errno(rc);
+		return rc;
+	}
+
+	if (extent_flags & OCFS2_EXT_REFCOUNTED)
+		rc = ocfs2_refcount_cow(inode, fe_bh, v_cluster, 1,
+					v_cluster + 1);
+
+	return rc;
+}
+
static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
@@ -341,7 +385,16 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
struct ocfs2_dinode *di;
u64 cluster_bytes;
- mlog_entry_void();
+ /*
+ * We need to CoW the cluster that contains the offset if it is
+ * reflinked, since we will call ocfs2_zero_range_for_truncate later,
+ * which will write "0" from offset to the end of the cluster.
+ */
+ status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
+ if (status) {
+ mlog_errno(status);
+ return status;
+ }
/* TODO: This needs to actually orphan the inode in this
* transaction. */
@@ -353,7 +406,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
goto out;
}
- status = ocfs2_journal_access_di(handle, inode, fe_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -378,16 +431,13 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
di->i_size = cpu_to_le64(new_i_size);
di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, fe_bh);
out_commit:
ocfs2_commit_trans(osb, handle);
out:
-
- mlog_exit(status);
return status;
}
@@ -398,16 +448,15 @@ static int ocfs2_truncate_file(struct inode *inode,
int status = 0;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_truncate_context *tc = NULL;
-
- mlog_entry("(inode = %llu, new_i_size = %llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)new_i_size);
/* We trust di_bh because it comes from ocfs2_inode_lock(), which
* already validated it */
fe = (struct ocfs2_dinode *) di_bh->b_data;
+ trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)le64_to_cpu(fe->i_size),
+ (unsigned long long)new_i_size);
+
mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
"Inode %llu, inode i_size = %lld != di "
"i_size = %llu, i_flags = 0x%x\n",
@@ -417,26 +466,19 @@ static int ocfs2_truncate_file(struct inode *inode,
le32_to_cpu(fe->i_flags));
if (new_i_size > le64_to_cpu(fe->i_size)) {
- mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
- (unsigned long long)le64_to_cpu(fe->i_size),
- (unsigned long long)new_i_size);
+ trace_ocfs2_truncate_file_error(
+ (unsigned long long)le64_to_cpu(fe->i_size),
+ (unsigned long long)new_i_size);
status = -EINVAL;
mlog_errno(status);
goto bail;
}
- mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- (unsigned long long)le64_to_cpu(fe->i_size),
- (unsigned long long)new_i_size);
-
- /* lets handle the simple truncate cases before doing any more
- * cluster locking. */
- if (new_i_size == le64_to_cpu(fe->i_size))
- goto bail;
-
down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ ocfs2_resv_discard(&osb->osb_la_resmap,
+ &OCFS2_I(inode)->ip_la_data_resv);
+
/*
* The inode lock forced other nodes to sync and drop their
* pages, which (correctly) happens even if we have a truncate
@@ -466,13 +508,7 @@ static int ocfs2_truncate_file(struct inode *inode,
goto bail_unlock_sem;
}
- status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
- if (status < 0) {
- mlog_errno(status);
- goto bail_unlock_sem;
- }
-
- status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
+ status = ocfs2_commit_truncate(osb, inode, di_bh);
if (status < 0) {
mlog_errno(status);
goto bail_unlock_sem;
@@ -483,8 +519,9 @@ bail_unlock_sem:
up_write(&OCFS2_I(inode)->ip_alloc_sem);
bail:
+ if (!status && OCFS2_I(inode)->ip_clusters == 0)
+ status = ocfs2_try_remove_refcount_tree(inode, di_bh);
- mlog_exit(status);
return status;
}
@@ -512,11 +549,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
int ret;
struct ocfs2_extent_tree et;
- ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
- ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
- clusters_to_add, mark_unwritten,
- &et, handle,
- data_ac, meta_ac, reason_ret);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
+ ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
+ clusters_to_add, mark_unwritten,
+ data_ac, meta_ac, reason_ret);
return ret;
}
@@ -538,10 +574,8 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
struct ocfs2_extent_tree et;
int did_quota = 0;
- mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
-
/*
- * This function only exists for file systems which don't
+ * Unwritten extent only exists for file systems which
* support holes.
*/
BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
@@ -556,12 +590,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
restart_all:
BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
- mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
- "clusters_to_add = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
- clusters_to_add);
- ocfs2_init_dinode_extent_tree(&et, inode, bh);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
&data_ac, &meta_ac);
if (status) {
@@ -569,8 +598,7 @@ restart_all:
goto leave;
}
- credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
- clusters_to_add);
+ credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
@@ -580,17 +608,22 @@ restart_all:
}
restarted_transaction:
- if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
- clusters_to_add))) {
- status = -EDQUOT;
+ trace_ocfs2_extend_allocation(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)i_size_read(inode),
+ le32_to_cpu(fe->i_clusters), clusters_to_add,
+ why, restart_func);
+
+ status = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+ if (status)
goto leave;
- }
did_quota = 1;
/* reserve a write to the file entry early on - that we if we
* run out of credits in the allocation path, we can still
* update i_size. */
- status = ocfs2_journal_access_di(handle, inode, bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -614,34 +647,25 @@ restarted_transaction:
mlog_errno(status);
goto leave;
}
-
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ ocfs2_journal_dirty(handle, bh);
spin_lock(&OCFS2_I(inode)->ip_lock);
clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
spin_unlock(&OCFS2_I(inode)->ip_lock);
/* Release unused quota reservation */
- vfs_dq_free_space(inode,
+ dquot_free_space(inode,
ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
did_quota = 0;
if (why != RESTART_NONE && clusters_to_add) {
if (why == RESTART_META) {
- mlog(0, "restarting function.\n");
restart_func = 1;
+ status = 0;
} else {
BUG_ON(why != RESTART_TRANS);
- mlog(0, "restarting transaction.\n");
- /* TODO: This can be more intelligent. */
- credits = ocfs2_calc_extend_credits(osb->sb,
- &fe->id2.i_list,
- clusters_to_add);
- status = ocfs2_extend_trans(handle, credits);
+ status = ocfs2_allocate_extend_trans(handle, 1);
if (status < 0) {
/* handle still has to be committed at
* this point. */
@@ -653,15 +677,15 @@ restarted_transaction:
}
}
- mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
+ trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
le32_to_cpu(fe->i_clusters),
- (unsigned long long)le64_to_cpu(fe->i_size));
- mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
- OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
+ (unsigned long long)le64_to_cpu(fe->i_size),
+ OCFS2_I(inode)->ip_clusters,
+ (unsigned long long)i_size_read(inode));
leave:
if (status < 0 && did_quota)
- vfs_dq_free_space(inode,
+ dquot_free_space(inode,
ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
if (handle) {
ocfs2_commit_trans(osb, handle);
@@ -682,65 +706,142 @@ leave:
brelse(bh);
bh = NULL;
- mlog_exit(status);
return status;
}
+/*
+ * While a write will already be ordering the data, a truncate will not.
+ * Thus, we need to explicitly order the zeroed pages.
+ *
+ * Returns NULL when ocfs2_should_order_data() reports that no ordering is
+ * required, a started handle (with @di_bh opened for journaled write) on
+ * success, or an ERR_PTR() on failure.  On failure any handle that was
+ * started has already been committed, so the caller must not commit it.
+ */
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
+						      struct buffer_head *di_bh)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle = NULL;
+	int ret = 0;
+
+	if (!ocfs2_should_order_data(inode))
+		goto out;
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		/*
+		 * Propagate the real failure reason instead of a blanket
+		 * -ENOMEM, as the other ocfs2_start_trans() callers do.
+		 */
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_jbd2_file_inode(handle, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret)
+		mlog_errno(ret);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
+
+out:
+	if (ret) {
+		/* Only a successfully started handle can be committed. */
+		if (!IS_ERR(handle))
+			ocfs2_commit_trans(osb, handle);
+		handle = ERR_PTR(ret);
+	}
+	return handle;
+}
+
/* Some parts of this taken from generic_cont_expand, which turned out
* to be too fragile to do exactly what we need without us having to
* worry about recursive locking in ->write_begin() and ->write_end(). */
-static int ocfs2_write_zero_page(struct inode *inode,
- u64 size)
+static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
+ u64 abs_to, struct buffer_head *di_bh)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long index;
- unsigned int offset;
+ unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
handle_t *handle = NULL;
- int ret;
+ int ret = 0;
+ unsigned zero_from, zero_to, block_start, block_end;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
- offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
- /* ugh. in prepare/commit_write, if from==to==start of block, we
- ** skip the prepare. make sure we never send an offset for the start
- ** of a block
- */
- if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
- offset++;
- }
- index = size >> PAGE_CACHE_SHIFT;
+ BUG_ON(abs_from >= abs_to);
+ BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
+ BUG_ON(abs_from & (inode->i_blkbits - 1));
- page = grab_cache_page(mapping, index);
+ page = find_or_create_page(mapping, index, GFP_NOFS);
if (!page) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
- ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_unlock;
- }
+ /* Get the offsets within the page that we want to zero */
+ zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
+ zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+ if (!zero_to)
+ zero_to = PAGE_CACHE_SIZE;
- if (ocfs2_should_order_data(inode)) {
- handle = ocfs2_start_walk_page_trans(inode, page, offset,
- offset);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
+ trace_ocfs2_write_zero_page(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)abs_from,
+ (unsigned long long)abs_to,
+ index, zero_from, zero_to);
+
+ /* We know that zero_from is block aligned */
+ for (block_start = zero_from; block_start < zero_to;
+ block_start = block_end) {
+ block_end = block_start + (1 << inode->i_blkbits);
+
+ /*
+ * block_start is block-aligned. Bump it by one to force
+ * __block_write_begin and block_commit_write to zero the
+ * whole block.
+ */
+ ret = __block_write_begin(page, block_start + 1, 0,
+ ocfs2_get_block);
+ if (ret < 0) {
+ mlog_errno(ret);
goto out_unlock;
}
- }
- /* must not update i_size! */
- ret = block_commit_write(page, offset, offset);
- if (ret < 0)
- mlog_errno(ret);
- else
- ret = 0;
+ if (!handle) {
+ handle = ocfs2_zero_start_ordered_transaction(inode,
+ di_bh);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ break;
+ }
+ }
+
+ /* must not update i_size! */
+ ret = block_commit_write(page, block_start + 1,
+ block_start + 1);
+ if (ret < 0)
+ mlog_errno(ret);
+ else
+ ret = 0;
+ }
- if (handle)
+ if (handle) {
+ /*
+ * fs-writeback will release the dirty pages without page lock
+ * whose offsets are beyond the inode size; the release happens in
+ * block_write_full_page().
+ */
+ i_size_write(inode, abs_to);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ di->i_mtime_nsec = di->i_ctime_nsec;
+ ocfs2_journal_dirty(handle, di_bh);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+ }
+
out_unlock:
unlock_page(page);
page_cache_release(page);
@@ -748,22 +849,115 @@ out:
return ret;
}
-static int ocfs2_zero_extend(struct inode *inode,
- u64 zero_to_size)
+/*
+ * Find the next range to zero. We do this in terms of bytes because
+ * that's what ocfs2_zero_extend() wants, and it is dealing with the
+ * pagecache. We may return multiple extents.
+ *
+ * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
+ * needs to be zeroed. range_start and range_end return the next zeroing
+ * range. A subsequent call should pass the previous range_end as its
+ * zero_start. If range_end is 0, there's nothing to do.
+ *
+ * Unwritten extents are skipped over. Refcounted extents are CoWd.
+ */
+static int ocfs2_zero_extend_get_range(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 zero_start, u64 zero_end,
+ u64 *range_start, u64 *range_end)
{
- int ret = 0;
- u64 start_off;
- struct super_block *sb = inode->i_sb;
+ int rc = 0, needs_cow = 0;
+ u32 p_cpos, zero_clusters = 0;
+ u32 zero_cpos =
+ zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+
+ while (zero_cpos < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
+ &num_clusters, &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
- start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
- while (start_off < zero_to_size) {
- ret = ocfs2_write_zero_page(inode, start_off);
- if (ret < 0) {
- mlog_errno(ret);
+ if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+ zero_clusters = num_clusters;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ break;
+ }
+
+ zero_cpos += num_clusters;
+ }
+ if (!zero_clusters) {
+ *range_end = 0;
+ goto out;
+ }
+
+ while ((zero_cpos + zero_clusters) < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
+ &p_cpos, &num_clusters,
+ &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
goto out;
}
- start_off += sb->s_blocksize;
+ if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
+ break;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ zero_clusters += num_clusters;
+ }
+ if ((zero_cpos + zero_clusters) > last_cpos)
+ zero_clusters = last_cpos - zero_cpos;
+
+ if (needs_cow) {
+ rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
+ zero_clusters, UINT_MAX);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+ }
+
+ *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
+ *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
+ zero_cpos + zero_clusters);
+
+out:
+ return rc;
+}
+
+/*
+ * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
+ * has made sure that the entire range needs zeroing.
+ */
+static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
+ u64 range_end, struct buffer_head *di_bh)
+{
+ int rc = 0;
+ u64 next_pos;
+ u64 zero_pos = range_start;
+
+ trace_ocfs2_zero_extend_range(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)range_start,
+ (unsigned long long)range_end);
+ BUG_ON(range_start >= range_end);
+
+ while (zero_pos < range_end) {
+ next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+ if (next_pos > range_end)
+ next_pos = range_end;
+ rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
+ if (rc < 0) {
+ mlog_errno(rc);
+ break;
+ }
+ zero_pos = next_pos;
/*
* Very large extends have the potential to lock up
@@ -772,16 +966,63 @@ static int ocfs2_zero_extend(struct inode *inode,
cond_resched();
}
-out:
+ return rc;
+}
+
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to_size)
+{
+ int ret = 0;
+ u64 zero_start, range_start = 0, range_end = 0;
+ struct super_block *sb = inode->i_sb;
+
+ zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+ trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)zero_start,
+ (unsigned long long)i_size_read(inode));
+ while (zero_start < zero_to_size) {
+ ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
+ zero_to_size,
+ &range_start,
+ &range_end);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ if (!range_end)
+ break;
+ /* Trim the ends */
+ if (range_start < zero_start)
+ range_start = zero_start;
+ if (range_end > zero_to_size)
+ range_end = zero_to_size;
+
+ ret = ocfs2_zero_extend_range(inode, range_start,
+ range_end, di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ zero_start = range_end;
+ }
+
return ret;
}
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to)
{
int ret;
u32 clusters_to_add;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ /*
+ * Only quota files call this without a bh, and they can't be
+ * refcounted.
+ */
+ BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
+
clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
if (clusters_to_add < oi->ip_clusters)
clusters_to_add = 0;
@@ -802,7 +1043,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
* still need to zero the area between the old i_size and the
* new i_size.
*/
- ret = ocfs2_zero_extend(inode, zero_to);
+ ret = ocfs2_zero_extend(inode, di_bh, zero_to);
if (ret < 0)
mlog_errno(ret);
@@ -824,27 +1065,15 @@ static int ocfs2_extend_file(struct inode *inode,
goto out;
if (i_size_read(inode) == new_i_size)
- goto out;
+ goto out;
BUG_ON(new_i_size < i_size_read(inode));
/*
- * Fall through for converting inline data, even if the fs
- * supports sparse files.
- *
- * The check for inline data here is legal - nobody can add
- * the feature since we have i_mutex. We must check it again
- * after acquiring ip_alloc_sem though, as paths like mmap
- * might have raced us to converting the inode to extents.
- */
- if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- goto out_update_size;
-
- /*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
* i_mutex to block other extend/truncate calls while we're
- * here.
+ * here. We even have to hold it for sparse files because there
+ * might be some tail zeroing.
*/
down_write(&oi->ip_alloc_sem);
@@ -861,14 +1090,16 @@ static int ocfs2_extend_file(struct inode *inode,
ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
if (ret) {
up_write(&oi->ip_alloc_sem);
-
mlog_errno(ret);
goto out;
}
}
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
+ else
+ ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
+ new_i_size);
up_write(&oi->ip_alloc_sem);
@@ -894,39 +1125,31 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
struct ocfs2_super *osb = OCFS2_SB(sb);
struct buffer_head *bh = NULL;
handle_t *handle = NULL;
- int locked[MAXQUOTAS] = {0, 0};
- int credits, qtype;
- struct ocfs2_mem_dqinfo *oinfo;
+ struct dquot *transfer_to[MAXQUOTAS] = { };
+ int qtype;
- mlog_entry("(0x%p, '%.*s')\n", dentry,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_setattr(inode, dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ dentry->d_name.len, dentry->d_name.name,
+ attr->ia_valid, attr->ia_mode,
+ from_kuid(&init_user_ns, attr->ia_uid),
+ from_kgid(&init_user_ns, attr->ia_gid));
/* ensuring we don't even attempt to truncate a symlink */
if (S_ISLNK(inode->i_mode))
attr->ia_valid &= ~ATTR_SIZE;
- if (attr->ia_valid & ATTR_MODE)
- mlog(0, "mode change: %d\n", attr->ia_mode);
- if (attr->ia_valid & ATTR_UID)
- mlog(0, "uid change: %d\n", attr->ia_uid);
- if (attr->ia_valid & ATTR_GID)
- mlog(0, "gid change: %d\n", attr->ia_gid);
- if (attr->ia_valid & ATTR_SIZE)
- mlog(0, "size change...\n");
- if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
- mlog(0, "time change...\n");
-
#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
| ATTR_GID | ATTR_UID | ATTR_MODE)
- if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
- mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
+ if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
return 0;
- }
status = inode_change_ok(inode, attr);
if (status)
return status;
+ if (is_quota_modification(inode, attr))
+ dquot_initialize(inode);
size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
if (size_change) {
status = ocfs2_rw_lock(inode, 1);
@@ -943,13 +1166,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
goto bail_unlock_rw;
}
- if (size_change && attr->ia_size != i_size_read(inode)) {
- if (attr->ia_size > sb->s_maxbytes) {
- status = -EFBIG;
+ if (size_change) {
+ status = inode_newsize_ok(inode, attr->ia_size);
+ if (status)
goto bail_unlock;
- }
- if (i_size_read(inode) > attr->ia_size) {
+ inode_dio_wait(inode);
+
+ if (i_size_read(inode) >= attr->ia_size) {
if (ocfs2_should_order_data(inode)) {
status = ocfs2_begin_ordered_truncate(inode,
attr->ia_size);
@@ -967,38 +1191,39 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
- credits = OCFS2_INODE_UPDATE_CREDITS;
- if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
+ if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
+ /*
+ * Gather pointers to quota structures so that allocation /
+ * freeing of quota structures happens here and not inside
+ * dquot_transfer() where we have problems with lock ordering
+ */
+ if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
- oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
- status = ocfs2_lock_global_qf(oinfo, 1);
- if (status < 0)
+ transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
+ if (!transfer_to[USRQUOTA]) {
+ status = -ESRCH;
goto bail_unlock;
- credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
- ocfs2_calc_qdel_credits(sb, USRQUOTA);
- locked[USRQUOTA] = 1;
+ }
}
- if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
+ if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
- oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
- status = ocfs2_lock_global_qf(oinfo, 1);
- if (status < 0)
+ transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
+ if (!transfer_to[GRPQUOTA]) {
+ status = -ESRCH;
goto bail_unlock;
- credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
- ocfs2_calc_qdel_credits(sb, GRPQUOTA);
- locked[GRPQUOTA] = 1;
+ }
}
- handle = ocfs2_start_trans(osb, credits);
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
+ 2 * ocfs2_quota_trans_credits(sb));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto bail_unlock;
}
- status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ status = __dquot_transfer(inode, transfer_to);
if (status < 0)
goto bail_commit;
} else {
@@ -1010,18 +1235,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- /*
- * This will intentionally not wind up calling vmtruncate(),
- * since all the work for a size change has been done above.
- * Otherwise, we could get into problems with truncate as
- * ip_alloc_sem is used there to protect against i_size
- * changes.
- */
- status = inode_setattr(inode, attr);
- if (status < 0) {
- mlog_errno(status);
- goto bail_commit;
- }
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
status = ocfs2_mark_inode_dirty(handle, inode, bh);
if (status < 0)
@@ -1030,12 +1245,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
- for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
- if (!locked[qtype])
- continue;
- oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
- ocfs2_unlock_global_qf(oinfo, 1);
- }
ocfs2_inode_unlock(inode, 1);
bail_unlock_rw:
if (size_change)
@@ -1043,13 +1252,16 @@ bail_unlock_rw:
bail:
brelse(bh);
+ /* Release quota pointers in case we acquired them */
+ for (qtype = 0; qtype < MAXQUOTAS; qtype++)
+ dqput(transfer_to[qtype]);
+
if (!status && attr->ia_valid & ATTR_MODE) {
- status = ocfs2_acl_chmod(inode);
+ status = posix_acl_chmod(inode, inode->i_mode);
if (status < 0)
mlog_errno(status);
}
- mlog_exit(status);
return status;
}
@@ -1062,8 +1274,6 @@ int ocfs2_getattr(struct vfsmount *mnt,
struct ocfs2_super *osb = sb->s_fs_info;
int err;
- mlog_entry_void();
-
err = ocfs2_inode_revalidate(dentry);
if (err) {
if (err != -ENOENT)
@@ -1077,8 +1287,6 @@ int ocfs2_getattr(struct vfsmount *mnt,
stat->blksize = osb->s_clustersize;
bail:
- mlog_exit(err);
-
return err;
}
@@ -1086,7 +1294,8 @@ int ocfs2_permission(struct inode *inode, int mask)
{
int ret;
- mlog_entry_void();
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
ret = ocfs2_inode_lock(inode, NULL, 0);
if (ret) {
@@ -1095,11 +1304,10 @@ int ocfs2_permission(struct inode *inode, int mask)
goto out;
}
- ret = generic_permission(inode, mask, ocfs2_check_acl);
+ ret = generic_permission(inode, mask);
ocfs2_inode_unlock(inode, 0);
out:
- mlog_exit(ret);
return ret;
}
@@ -1111,8 +1319,9 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di;
- mlog_entry("(Inode %llu, mode 0%o)\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
+ trace_ocfs2_write_remove_suid(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ inode->i_mode);
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
@@ -1121,7 +1330,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
goto out;
}
- ret = ocfs2_journal_access_di(handle, inode, bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
@@ -1134,15 +1343,13 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
di = (struct ocfs2_dinode *) bh->b_data;
di->i_mode = cpu_to_le16(inode->i_mode);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, bh);
out_trans:
ocfs2_commit_trans(osb, handle);
out:
- mlog_exit(ret);
return ret;
}
@@ -1321,8 +1528,9 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
* partial clusters here. There's no need to worry about
* physical allocation - the zeroing code knows to skip holes.
*/
- mlog(0, "byte start: %llu, end: %llu\n",
- (unsigned long long)start, (unsigned long long)end);
+ trace_ocfs2_zero_partial_clusters(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)start, (unsigned long long)end);
/*
* If both edges are on a cluster boundary then there's no
@@ -1346,8 +1554,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
if (tmpend > end)
tmpend = end;
- mlog(0, "1st range: start: %llu, tmpend: %llu\n",
- (unsigned long long)start, (unsigned long long)tmpend);
+ trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
+ (unsigned long long)tmpend);
ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
if (ret)
@@ -1361,33 +1569,125 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
*/
start = end & ~(osb->s_clustersize - 1);
- mlog(0, "2nd range: start: %llu, end: %llu\n",
- (unsigned long long)start, (unsigned long long)end);
+ trace_ocfs2_zero_partial_clusters_range2(
+ (unsigned long long)start, (unsigned long long)end);
ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
if (ret)
mlog_errno(ret);
}
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_commit_trans(osb, handle);
out:
return ret;
}
+static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
+{
+ int i;
+ struct ocfs2_extent_rec *rec = NULL;
+
+ for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+
+ rec = &el->l_recs[i];
+
+ if (le32_to_cpu(rec->e_cpos) < pos)
+ break;
+ }
+
+ return i;
+}
+
+/*
+ * Helper to calculate the punching pos and length in one run, we handle the
+ * following three cases in order:
+ *
+ * - remove the entire record
+ * - remove a partial record
+ * - no record needs to be removed (hole-punching completed)
+*/
+static void ocfs2_calc_trunc_pos(struct inode *inode,
+ struct ocfs2_extent_list *el,
+ struct ocfs2_extent_rec *rec,
+ u32 trunc_start, u32 *trunc_cpos,
+ u32 *trunc_len, u32 *trunc_end,
+ u64 *blkno, int *done)
+{
+ int ret = 0;
+ u32 coff, range;
+
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+ if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
+ /*
+ * remove an entire extent record.
+ */
+ *trunc_cpos = le32_to_cpu(rec->e_cpos);
+ /*
+ * Skip holes if any.
+ */
+ if (range < *trunc_end)
+ *trunc_end = range;
+ *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
+ *blkno = le64_to_cpu(rec->e_blkno);
+ *trunc_end = le32_to_cpu(rec->e_cpos);
+ } else if (range > trunc_start) {
+ /*
+ * remove a partial extent record, which means we're
+ * removing the last extent record.
+ */
+ *trunc_cpos = trunc_start;
+ /*
+ * skip hole if any.
+ */
+ if (range < *trunc_end)
+ *trunc_end = range;
+ *trunc_len = *trunc_end - trunc_start;
+ coff = trunc_start - le32_to_cpu(rec->e_cpos);
+ *blkno = le64_to_cpu(rec->e_blkno) +
+ ocfs2_clusters_to_blocks(inode->i_sb, coff);
+ *trunc_end = trunc_start;
+ } else {
+ /*
+ * It may have two following possibilities:
+ *
+ * - last record has been removed
+ * - trunc_start was within a hole
+ *
+ * both two cases mean the completion of hole punching.
+ */
+ ret = 1;
+ }
+
+ *done = ret;
+}
+
static int ocfs2_remove_inode_range(struct inode *inode,
struct buffer_head *di_bh, u64 byte_start,
u64 byte_len)
{
- int ret = 0;
- u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
+ int ret = 0, flags = 0, done = 0, i;
+ u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
+ u32 cluster_in_el;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_cached_dealloc_ctxt dealloc;
struct address_space *mapping = inode->i_mapping;
struct ocfs2_extent_tree et;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_list *el = NULL;
+ struct ocfs2_extent_rec *rec = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
- ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
ocfs2_init_dealloc_ctxt(&dealloc);
+ trace_ocfs2_remove_inode_range(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)byte_start,
+ (unsigned long long)byte_len);
+
if (byte_len == 0)
return 0;
@@ -1409,17 +1709,30 @@ static int ocfs2_remove_inode_range(struct inode *inode,
goto out;
}
- trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
- trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
- if (trunc_len >= trunc_start)
- trunc_len -= trunc_start;
- else
- trunc_len = 0;
+ /*
+ * For reflinks, we may need to CoW 2 clusters which might be
+ * partially zero'd later, if hole's start and end offset were
+ * within one cluster(means is not exactly aligned to clustersize).
+ */
- mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)byte_start,
- (unsigned long long)byte_len, trunc_start, trunc_len);
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+
+ ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
+ trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
+ cluster_in_el = trunc_end;
ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
if (ret) {
@@ -1427,36 +1740,85 @@ static int ocfs2_remove_inode_range(struct inode *inode,
goto out;
}
- cpos = trunc_start;
- while (trunc_len) {
- ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
- &alloc_size, NULL);
+ path = ocfs2_new_path_from_et(&et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ while (trunc_end > trunc_start) {
+
+ ret = ocfs2_find_path(INODE_CACHE(inode), path,
+ cluster_in_el);
if (ret) {
mlog_errno(ret);
goto out;
}
- if (alloc_size > trunc_len)
- alloc_size = trunc_len;
+ el = path_leaf_el(path);
- /* Only do work for non-holes */
- if (phys_cpos != 0) {
- ret = ocfs2_remove_btree_range(inode, &et, cpos,
- phys_cpos, alloc_size,
- &dealloc);
+ i = ocfs2_find_rec(el, trunc_end);
+ /*
+ * Need to go to previous extent block.
+ */
+ if (i < 0) {
+ if (path->p_tree_depth == 0)
+ break;
+
+ ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+ path,
+ &cluster_in_el);
if (ret) {
mlog_errno(ret);
goto out;
}
+
+ /*
+ * We've reached the leftmost extent block,
+ * it's safe to leave.
+ */
+ if (cluster_in_el == 0)
+ break;
+
+ /*
+ * The 'pos' searched for previous extent block is
+ * always one cluster less than actual trunc_end.
+ */
+ trunc_end = cluster_in_el + 1;
+
+ ocfs2_reinit_path(path, 1);
+
+ continue;
+
+ } else
+ rec = &el->l_recs[i];
+
+ ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
+ &trunc_len, &trunc_end, &blkno, &done);
+ if (done)
+ break;
+
+ flags = rec->e_flags;
+ phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+ ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+ phys_cpos, trunc_len, flags,
+ &dealloc, refcount_loc);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
}
- cpos += alloc_size;
- trunc_len -= alloc_size;
+ cluster_in_el = trunc_end;
+
+ ocfs2_reinit_path(path, 1);
}
ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
out:
+ ocfs2_free_path(path);
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &dealloc);
@@ -1530,7 +1892,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
}
size = sr->l_start + sr->l_len;
- if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
+ if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
+ cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
if (sr->l_len <= 0) {
ret = -EINVAL;
goto out_inode_unlock;
@@ -1588,6 +1951,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (ret < 0)
mlog_errno(ret);
+ if (file && (file->f_flags & O_SYNC))
+ handle->h_sync = 1;
+
ocfs2_commit_trans(osb, handle);
out_inode_unlock:
@@ -1604,8 +1970,9 @@ out:
int ocfs2_change_file_space(struct file *file, unsigned int cmd,
struct ocfs2_space_resv *sr)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int ret;
if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
!ocfs2_writes_unwritten_extents(osb))
@@ -1620,44 +1987,131 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
- return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+ ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
+ mnt_drop_write_file(file);
+ return ret;
}
-static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
+static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
+ struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_space_resv sr;
int change_size = 1;
+ int cmd = OCFS2_IOC_RESVSP64;
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
if (!ocfs2_writes_unwritten_extents(osb))
return -EOPNOTSUPP;
- if (S_ISDIR(inode->i_mode))
- return -ENODEV;
-
if (mode & FALLOC_FL_KEEP_SIZE)
change_size = 0;
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ cmd = OCFS2_IOC_UNRESVSP64;
+
sr.l_whence = 0;
sr.l_start = (s64)offset;
sr.l_len = (s64)len;
- return __ocfs2_change_file_space(NULL, inode, offset,
- OCFS2_IOC_RESVSP64, &sr, change_size);
+ return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
+ change_size);
+}
+
+int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
+ size_t count)
+{
+ int ret = 0;
+ unsigned int extent_flags;
+ u32 cpos, clusters, extent_len, phys_cpos;
+ struct super_block *sb = inode->i_sb;
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
+ !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
+ OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+ clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+ while (clusters) {
+ ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+ &extent_flags);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
+ ret = 1;
+ break;
+ }
+
+ if (extent_len > clusters)
+ extent_len = clusters;
+
+ clusters -= extent_len;
+ cpos += extent_len;
+ }
+out:
+ return ret;
+}
+
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+ int blockmask = inode->i_sb->s_blocksize - 1;
+ loff_t final_size = pos + count;
+
+ if ((pos & blockmask) || (final_size & blockmask))
+ return 1;
+ return 0;
}
-static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
+static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+ struct file *file,
+ loff_t pos, size_t count,
+ int *meta_level)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+ u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 clusters =
+ ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ *meta_level = 1;
+
+ ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
+ if (ret)
+ mlog_errno(ret);
+out:
+ brelse(di_bh);
+ return ret;
+}
+
+static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t *ppos,
size_t count,
int appending,
- int *direct_io)
+ int *direct_io,
+ int *has_refcount)
{
int ret = 0, meta_level = 0;
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
- loff_t saved_pos, end;
+ loff_t saved_pos = 0, end;
- /*
+ /*
* We start with a read level meta lock and only jump to an ex
* if we need to make modifications here.
*/
@@ -1674,7 +2128,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
* remove_suid() calls ->setattr without any hint that
* we may have already done our cluster locking. Since
* ocfs2_setattr() *must* take cluster locks to
- * proceeed, this will lead us to recursively lock the
+ * proceed, this will lead us to recursively lock the
* inode. There's also the dinode i_size state which
* can be lost via setattr during extending writes (we
* set inode->i_size at the end of a write. */
@@ -1694,15 +2148,34 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
/* work on a copy of ppos until we're sure that we won't have
* to recalculate it due to relocking. */
- if (appending) {
+ if (appending)
saved_pos = i_size_read(inode);
- mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
- } else {
+ else
saved_pos = *ppos;
- }
end = saved_pos + count;
+ ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
+ if (ret == 1) {
+ ocfs2_inode_unlock(inode, meta_level);
+ meta_level = -1;
+
+ ret = ocfs2_prepare_inode_for_refcount(inode,
+ file,
+ saved_pos,
+ count,
+ &meta_level);
+ if (has_refcount)
+ *has_refcount = 1;
+ if (direct_io)
+ *direct_io = 0;
+ }
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
/*
* Skip the O_DIRECT checks if we don't need
* them.
@@ -1749,74 +2222,107 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
*ppos = saved_pos;
out_unlock:
- ocfs2_inode_unlock(inode, meta_level);
+ trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
+ saved_pos, appending, count,
+ direct_io, has_refcount);
+
+ if (meta_level >= 0)
+ ocfs2_inode_unlock(inode, meta_level);
out:
return ret;
}
-static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
+static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
- int can_do_direct;
+ int can_do_direct, has_refcount = 0;
ssize_t written = 0;
- size_t ocount; /* original count */
- size_t count; /* after file limit checks */
+ size_t count = iov_iter_count(from);
loff_t old_size, *ppos = &iocb->ki_pos;
u32 old_clusters;
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int full_coherency = !(osb->s_mount_opt &
+ OCFS2_MOUNT_COHERENCY_BUFFERED);
+ int unaligned_dio = 0;
- mlog_entry("(0x%p, %u, '%.*s')\n", file,
- (unsigned int)nr_segs,
- file->f_path.dentry->d_name.len,
- file->f_path.dentry->d_name.name);
+ trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name,
+ (unsigned int)from->nr_segs); /* GRRRRR */
- if (iocb->ki_left == 0)
+ if (iocb->ki_nbytes == 0)
return 0;
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-
appending = file->f_flags & O_APPEND ? 1 : 0;
direct_io = file->f_flags & O_DIRECT ? 1 : 0;
mutex_lock(&inode->i_mutex);
+ ocfs2_iocb_clear_sem_locked(iocb);
+
relock:
- /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
+ /* to match setattr's i_mutex -> rw_lock ordering */
if (direct_io) {
- down_read(&inode->i_alloc_sem);
have_alloc_sem = 1;
+ /* communicate with ocfs2_dio_end_io */
+ ocfs2_iocb_set_sem_locked(iocb);
}
- /* concurrent O_DIRECT writes are allowed */
- rw_level = !direct_io;
+ /*
+ * Concurrent O_DIRECT writes are allowed with
+ * mount_option "coherency=buffered".
+ */
+ rw_level = (!direct_io || full_coherency);
+
ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
mlog_errno(ret);
goto out_sems;
}
+ /*
+ * O_DIRECT writes with "coherency=full" need to take EX cluster
+ * inode_lock to guarantee coherency.
+ */
+ if (direct_io && full_coherency) {
+ /*
+ * We need to take and drop the inode lock to force
+ * other nodes to drop their caches. Buffered I/O
+ * already does this in write_begin().
+ */
+ ret = ocfs2_inode_lock(inode, NULL, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_inode_unlock(inode, 1);
+ }
+
can_do_direct = direct_io;
- ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
- iocb->ki_left, appending,
- &can_do_direct);
+ ret = ocfs2_prepare_inode_for_write(file, ppos,
+ iocb->ki_nbytes, appending,
+ &can_do_direct, &has_refcount);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
+ if (direct_io && !is_sync_kiocb(iocb))
+ unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes,
+ *ppos);
+
/*
* We can't complete the direct I/O as requested, fall back to
* buffered I/O.
*/
if (direct_io && !can_do_direct) {
ocfs2_rw_unlock(inode, rw_level);
- up_read(&inode->i_alloc_sem);
have_alloc_sem = 0;
rw_level = -1;
@@ -1825,6 +2331,16 @@ relock:
goto relock;
}
+ if (unaligned_dio) {
+ /*
+ * Wait on previous unaligned aio to complete before
+ * proceeding.
+ */
+ mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
+ /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
+ ocfs2_iocb_set_unaligned_aio(iocb);
+ }
+
/*
* To later detect whether a journal commit for sync writes is
* necessary, we sample i_size, and cluster count here.
@@ -1835,67 +2351,68 @@ relock:
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb, rw_level);
- if (direct_io) {
- ret = generic_segment_checks(iov, &nr_segs, &ocount,
- VERIFY_READ);
- if (ret)
- goto out_dio;
-
- ret = generic_write_checks(file, ppos, &count,
- S_ISBLK(inode->i_mode));
- if (ret)
- goto out_dio;
+ ret = generic_write_checks(file, ppos, &count,
+ S_ISBLK(inode->i_mode));
+ if (ret)
+ goto out_dio;
- written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
- ppos, count, ocount);
+ iov_iter_truncate(from, count);
+ if (direct_io) {
+ written = generic_file_direct_write(iocb, from, *ppos);
if (written < 0) {
- /*
- * direct write may have instantiated a few
- * blocks outside i_size. Trim these off again.
- * Don't need i_size_read because we hold i_mutex.
- */
- if (*ppos + count > inode->i_size)
- vmtruncate(inode, inode->i_size);
ret = written;
goto out_dio;
}
} else {
- written = generic_file_aio_write_nolock(iocb, iov, nr_segs,
- *ppos);
+ current->backing_dev_info = file->f_mapping->backing_dev_info;
+ written = generic_perform_write(file, from, *ppos);
+ if (likely(written >= 0))
+ iocb->ki_pos = *ppos + written;
+ current->backing_dev_info = NULL;
}
out_dio:
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
- if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
- /*
- * The generic write paths have handled getting data
- * to disk, but since we don't make use of the dirty
- * inode list, a manual journal commit is necessary
- * here.
- */
- if (old_size != i_size_read(inode) ||
- old_clusters != OCFS2_I(inode)->ip_clusters) {
+ if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
+ ((file->f_flags & O_DIRECT) && !direct_io)) {
+ ret = filemap_fdatawrite_range(file->f_mapping, *ppos,
+ *ppos + count - 1);
+ if (ret < 0)
+ written = ret;
+
+ if (!ret && ((old_size != i_size_read(inode)) ||
+ (old_clusters != OCFS2_I(inode)->ip_clusters) ||
+ has_refcount)) {
ret = jbd2_journal_force_commit(osb->journal->j_journal);
if (ret < 0)
written = ret;
}
+
+ if (!ret)
+ ret = filemap_fdatawait_range(file->f_mapping, *ppos,
+ *ppos + count - 1);
}
- /*
+ /*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
* function pointer which is called when o_direct io completes so that
- * it can unlock our rw lock. (it's the clustered equivalent of
- * i_alloc_sem; protects truncate from racing with pending ios).
+ * it can unlock our rw lock.
* Unfortunately there are error cases which call end_io and others
* that don't. so we don't have to unlock the rw_lock if either an
* async dio is going to do it in the future or an end_io after an
* error has already done it.
*/
- if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+ if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
have_alloc_sem = 0;
+ unaligned_dio = 0;
+ }
+
+ if (unaligned_dio) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
+ mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
out:
@@ -1904,109 +2421,12 @@ out:
out_sems:
if (have_alloc_sem)
- up_read(&inode->i_alloc_sem);
+ ocfs2_iocb_clear_sem_locked(iocb);
mutex_unlock(&inode->i_mutex);
- mlog_exit(ret);
- return written ? written : ret;
-}
-
-static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
- struct file *out,
- struct splice_desc *sd)
-{
- int ret;
-
- ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
- sd->total_len, 0, NULL);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- return splice_from_pipe_feed(pipe, sd, pipe_to_file);
-}
-
-static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
- struct file *out,
- loff_t *ppos,
- size_t len,
- unsigned int flags)
-{
- int ret;
- struct address_space *mapping = out->f_mapping;
- struct inode *inode = mapping->host;
- struct splice_desc sd = {
- .total_len = len,
- .flags = flags,
- .pos = *ppos,
- .u.file = out,
- };
-
- mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
- (unsigned int)len,
- out->f_path.dentry->d_name.len,
- out->f_path.dentry->d_name.name);
-
- if (pipe->inode)
- mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
-
- splice_from_pipe_begin(&sd);
- do {
- ret = splice_from_pipe_next(pipe, &sd);
- if (ret <= 0)
- break;
-
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- ret = ocfs2_rw_lock(inode, 1);
- if (ret < 0)
- mlog_errno(ret);
- else {
- ret = ocfs2_splice_to_file(pipe, out, &sd);
- ocfs2_rw_unlock(inode, 1);
- }
- mutex_unlock(&inode->i_mutex);
- } while (ret > 0);
- splice_from_pipe_end(pipe, &sd);
-
- if (pipe->inode)
- mutex_unlock(&pipe->inode->i_mutex);
-
- if (sd.num_spliced)
- ret = sd.num_spliced;
-
- if (ret > 0) {
- unsigned long nr_pages;
-
- *ppos += ret;
- nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
- /*
- * If file or inode is SYNC and we actually wrote some data,
- * sync it.
- */
- if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err;
-
- mutex_lock(&inode->i_mutex);
- err = ocfs2_rw_lock(inode, 1);
- if (err < 0) {
- mlog_errno(err);
- } else {
- err = generic_osync_inode(inode, mapping,
- OSYNC_METADATA|OSYNC_DATA);
- ocfs2_rw_unlock(inode, 1);
- }
- mutex_unlock(&inode->i_mutex);
-
- if (err)
- ret = err;
- }
- balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
- }
-
- mlog_exit(ret);
+ if (written)
+ ret = written;
return ret;
}
@@ -2016,44 +2436,43 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
size_t len,
unsigned int flags)
{
- int ret = 0;
- struct inode *inode = in->f_path.dentry->d_inode;
+ int ret = 0, lock_level = 0;
+ struct inode *inode = file_inode(in);
- mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
- (unsigned int)len,
- in->f_path.dentry->d_name.len,
- in->f_path.dentry->d_name.name);
+ trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ in->f_path.dentry->d_name.len,
+ in->f_path.dentry->d_name.name, len);
/*
- * See the comment in ocfs2_file_aio_read()
+ * See the comment in ocfs2_file_read_iter()
*/
- ret = ocfs2_inode_lock(inode, NULL, 0);
+ ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto bail;
}
- ocfs2_inode_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, lock_level);
ret = generic_file_splice_read(in, ppos, pipe, len, flags);
bail:
- mlog_exit(ret);
return ret;
}
-static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
+static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
+ struct iov_iter *to)
{
int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
struct file *filp = iocb->ki_filp;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
+
+ trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ filp->f_path.dentry->d_name.len,
+ filp->f_path.dentry->d_name.name,
+ to->nr_segs); /* GRRRRR */
- mlog_entry("(0x%p, %u, '%.*s')\n", filp,
- (unsigned int)nr_segs,
- filp->f_path.dentry->d_name.len,
- filp->f_path.dentry->d_name.name);
if (!inode) {
ret = -EINVAL;
@@ -2061,13 +2480,15 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
goto bail;
}
- /*
+ ocfs2_iocb_clear_sem_locked(iocb);
+
+ /*
* buffered reads protect themselves in ->readpage(). O_DIRECT reads
* need locks to protect pending reads from racing with truncate.
*/
if (filp->f_flags & O_DIRECT) {
- down_read(&inode->i_alloc_sem);
have_alloc_sem = 1;
+ ocfs2_iocb_set_sem_locked(iocb);
ret = ocfs2_rw_lock(inode, 0);
if (ret < 0) {
@@ -2083,26 +2504,25 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
* We're fine letting folks race truncates and extending
* writes with read across the cluster, just like they can
* locally. Hence no rw_lock during read.
- *
+ *
* Take and drop the meta data lock to update inode fields
* like i_size. This allows the checks down below
- * generic_file_aio_read() a chance of actually working.
+ * generic_file_aio_read() a chance of actually working.
*/
- ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+ ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto bail;
}
ocfs2_inode_unlock(inode, lock_level);
- ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
- if (ret == -EINVAL)
- mlog(0, "generic_file_aio_read returned -EINVAL\n");
+ ret = generic_file_read_iter(iocb, to);
+ trace_generic_file_aio_read_ret(ret);
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
- /* see ocfs2_file_aio_write */
+ /* see ocfs2_file_write_iter */
if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
rw_level = -1;
have_alloc_sem = 0;
@@ -2110,14 +2530,64 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
bail:
if (have_alloc_sem)
- up_read(&inode->i_alloc_sem);
- if (rw_level != -1)
+ ocfs2_iocb_clear_sem_locked(iocb);
+
+ if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
- mlog_exit(ret);
return ret;
}
+/* Refer generic_file_llseek_unlocked() */
+static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ switch (whence) {
+ case SEEK_SET:
+ break;
+ case SEEK_END:
+ /* SEEK_END requires the OCFS2 inode lock for the file
+ * because it references the file's size.
+ */
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ offset += i_size_read(inode);
+ ocfs2_inode_unlock(inode, 0);
+ break;
+ case SEEK_CUR:
+ if (offset == 0) {
+ offset = file->f_pos;
+ goto out;
+ }
+ offset += file->f_pos;
+ break;
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
+ if (ret)
+ goto out;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+
+out:
+ mutex_unlock(&inode->i_mutex);
+ if (ret)
+ return ret;
+ return offset;
+}
+
const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
@@ -2126,14 +2596,17 @@ const struct inode_operations ocfs2_file_iops = {
.getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
.removexattr = generic_removexattr,
- .fallocate = ocfs2_fallocate,
.fiemap = ocfs2_fiemap,
+ .get_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
};
const struct inode_operations ocfs2_special_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
+ .get_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
};
/*
@@ -2141,15 +2614,15 @@ const struct inode_operations ocfs2_special_file_iops = {
* ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
*/
const struct file_operations ocfs2_fops = {
- .llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
+ .llseek = ocfs2_file_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
.mmap = ocfs2_mmap,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
- .aio_read = ocfs2_file_aio_read,
- .aio_write = ocfs2_file_aio_write,
+ .read_iter = ocfs2_file_read_iter,
+ .write_iter = ocfs2_file_write_iter,
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
@@ -2157,13 +2630,14 @@ const struct file_operations ocfs2_fops = {
.lock = ocfs2_lock,
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
- .splice_write = ocfs2_file_splice_write,
+ .splice_write = iter_file_splice_write,
+ .fallocate = ocfs2_fallocate,
};
const struct file_operations ocfs2_dops = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = ocfs2_readdir,
+ .iterate = ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
@@ -2188,28 +2662,29 @@ const struct file_operations ocfs2_dops = {
* the cluster.
*/
const struct file_operations ocfs2_fops_no_plocks = {
- .llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
+ .llseek = ocfs2_file_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
.mmap = ocfs2_mmap,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
- .aio_read = ocfs2_file_aio_read,
- .aio_write = ocfs2_file_aio_write,
+ .read_iter = ocfs2_file_read_iter,
+ .write_iter = ocfs2_file_write_iter,
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
- .splice_write = ocfs2_file_splice_write,
+ .splice_write = iter_file_splice_write,
+ .fallocate = ocfs2_fallocate,
};
const struct file_operations ocfs2_dops_no_plocks = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = ocfs2_readdir,
+ .iterate = ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 172f9fbc9fc..97bf761c9e7 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
int ocfs2_simple_size_update(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size);
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
- u64 zero_to);
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to);
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
@@ -69,4 +71,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
int ocfs2_change_file_space(struct file *file, unsigned int cmd,
struct ocfs2_space_resv *sr);
+int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
+ size_t count);
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db86..d8208b20dc5 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,10 +26,8 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -38,6 +36,7 @@
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -67,7 +66,7 @@ void ocfs2_do_node_down(int node_num, void *data)
BUG_ON(osb->node_num == node_num);
- mlog(0, "ocfs2: node down event for %d\n", node_num);
+ trace_ocfs2_do_node_down(node_num);
if (!osb->cconn) {
/*
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 10e1fa87396..437de7f768c 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,14 +25,12 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <asm/byteorder.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -53,6 +51,8 @@
#include "sysfile.h"
#include "uptodate.h"
#include "xattr.h"
+#include "refcounttree.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -130,8 +130,10 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
+ journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
- mlog_entry("(blkno = %llu)\n", (unsigned long long)blkno);
+ trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
+ sysfile_type);
/* Ok. By now we've either got the offsets passed to us by the
* caller, or we just pulled them off the bh. Lets do some
@@ -152,27 +154,52 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
/* inode was *not* in the inode cache. 2.6.x requires
* us to do our own read_inode call and unlock it
* afterwards. */
- if (inode && inode->i_state & I_NEW) {
- mlog(0, "Inode was not in inode cache, reading it.\n");
- ocfs2_read_locked_inode(inode, &args);
- unlock_new_inode(inode);
- }
if (inode == NULL) {
inode = ERR_PTR(-ENOMEM);
mlog_errno(PTR_ERR(inode));
goto bail;
}
+ trace_ocfs2_iget5_locked(inode->i_state);
+ if (inode->i_state & I_NEW) {
+ ocfs2_read_locked_inode(inode, &args);
+ unlock_new_inode(inode);
+ }
if (is_bad_inode(inode)) {
iput(inode);
inode = ERR_PTR(-ESTALE);
goto bail;
}
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ transaction_t *transaction;
+ tid_t tid;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ read_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ read_unlock(&journal->j_state_lock);
+ oi->i_sync_tid = tid;
+ oi->i_datasync_tid = tid;
+ }
+
bail:
if (!IS_ERR(inode)) {
- mlog(0, "returning inode with number %llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- mlog_exit_ptr(inode);
+ trace_ocfs2_iget_end(inode,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
}
return inode;
@@ -192,18 +219,17 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
int ret = 0;
- mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
-
args = opaque;
mlog_bug_on_msg(!inode, "No inode in find actor!\n");
+ trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno);
+
if (oi->ip_blkno != args->fi_blkno)
goto bail;
ret = 1;
bail:
- mlog_exit(ret);
return ret;
}
@@ -215,16 +241,24 @@ bail:
static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
{
struct ocfs2_find_inode_args *args = opaque;
-
- mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
+ static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
+ ocfs2_file_ip_alloc_sem_key;
inode->i_ino = args->fi_ino;
OCFS2_I(inode)->ip_blkno = args->fi_blkno;
if (args->fi_sysfile_type != 0)
lockdep_set_class(&inode->i_mutex,
&ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
+ if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE ||
+ args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE ||
+ args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
+ args->fi_sysfile_type == LOCAL_GROUP_QUOTA_SYSTEM_INODE)
+ lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
+ &ocfs2_quota_ip_alloc_sem_key);
+ else
+ lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
+ &ocfs2_file_ip_alloc_sem_key);
- mlog_exit(0);
return 0;
}
@@ -235,9 +269,6 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
struct ocfs2_super *osb;
int use_plocks = 1;
- mlog_entry("(0x%p, size:%llu)\n", inode,
- (unsigned long long)le64_to_cpu(fe->i_size));
-
sb = inode->i_sb;
osb = OCFS2_SB(sb);
@@ -265,15 +296,17 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
inode->i_generation = le32_to_cpu(fe->i_generation);
inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
inode->i_mode = le16_to_cpu(fe->i_mode);
- inode->i_uid = le32_to_cpu(fe->i_uid);
- inode->i_gid = le32_to_cpu(fe->i_gid);
+ i_uid_write(inode, le32_to_cpu(fe->i_uid));
+ i_gid_write(inode, le32_to_cpu(fe->i_gid));
/* Fast symlinks will have i_size but no allocated clusters. */
- if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
+ if (S_ISLNK(inode->i_mode) && !fe->i_clusters) {
inode->i_blocks = 0;
- else
+ inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
+ } else {
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_mapping->a_ops = &ocfs2_aops;
+ inode->i_mapping->a_ops = &ocfs2_aops;
+ }
inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
@@ -287,22 +320,22 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)le64_to_cpu(fe->i_blkno));
- inode->i_nlink = ocfs2_read_links_count(fe);
+ set_nlink(inode, ocfs2_read_links_count(fe));
+ trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno,
+ le32_to_cpu(fe->i_flags));
if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
inode->i_flags |= S_NOQUOTA;
}
-
+
if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
- mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
} else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
inode->i_flags |= S_NOQUOTA;
} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
- mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
/* we can't actually hit this as read_inode can't
* handle superblocks today ;-) */
BUG();
@@ -324,12 +357,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
else
inode->i_fop = &ocfs2_dops_no_plocks;
i_size_write(inode, le64_to_cpu(fe->i_size));
+ OCFS2_I(inode)->ip_dir_lock_gen = 1;
break;
case S_IFLNK:
- if (ocfs2_inode_is_fast_symlink(inode))
- inode->i_op = &ocfs2_fast_symlink_inode_operations;
- else
- inode->i_op = &ocfs2_symlink_inode_operations;
+ inode->i_op = &ocfs2_symlink_inode_operations;
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
default:
@@ -365,7 +396,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
OCFS2_I(inode)->ip_last_used_slot = 0;
OCFS2_I(inode)->ip_last_used_group = 0;
- mlog_exit_void();
+
+ if (S_ISDIR(inode->i_mode))
+ ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
+ OCFS2_RESV_FLAG_DIR);
}
static int ocfs2_read_locked_inode(struct inode *inode,
@@ -378,22 +412,10 @@ static int ocfs2_read_locked_inode(struct inode *inode,
int status, can_lock;
u32 generation = 0;
- mlog_entry("(0x%p, 0x%p)\n", inode, args);
-
status = -EINVAL;
- if (inode == NULL || inode->i_sb == NULL) {
- mlog(ML_ERROR, "bad inode\n");
- return status;
- }
sb = inode->i_sb;
osb = OCFS2_SB(sb);
- if (!args) {
- mlog(ML_ERROR, "bad inode args\n");
- make_bad_inode(inode);
- return status;
- }
-
/*
* To improve performance of cold-cache inode stats, we take
* the cluster lock here if possible.
@@ -418,7 +440,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
* #1 and #2 can be simply solved by never taking the lock
* here for system files (which are the only type we read
* during mount). It's a heavier approach, but our main
- * concern is user-accesible files anyway.
+ * concern is user-accessible files anyway.
*
* #3 works itself out because we'll eventually take the
* cluster lock before trusting anything anyway.
@@ -427,6 +449,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
&& !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
&& !ocfs2_mount_local(osb);
+ trace_ocfs2_read_locked_inode(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock);
+
/*
* To maintain backwards compatibility with older versions of
* ocfs2-tools, we still store the generation value for system
@@ -463,7 +488,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
status = ocfs2_try_open_lock(inode, 0);
if (status) {
- make_bad_inode(inode);
+ make_bad_inode(inode);
return status;
}
}
@@ -473,7 +498,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
OCFS2_BH_IGNORE_CACHE);
} else {
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
- if (!status)
+ /*
+ * If buffer is in jbd, then its checksum may not have been
+ * computed as yet.
+ */
+ if (!status && !buffer_jbd(bh))
status = ocfs2_validate_inode_block(osb->sb, bh);
}
if (status < 0) {
@@ -514,7 +543,6 @@ bail:
if (args && bh)
brelse(bh);
- mlog_exit(status);
return status;
}
@@ -528,12 +556,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct buffer_head *fe_bh)
{
int status = 0;
- struct ocfs2_truncate_context *tc = NULL;
struct ocfs2_dinode *fe;
handle_t *handle = NULL;
- mlog_entry_void();
-
fe = (struct ocfs2_dinode *) fe_bh->b_data;
/*
@@ -547,11 +572,13 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
+ handle = NULL;
mlog_errno(status);
goto out;
}
- status = ocfs2_journal_access_di(handle, inode, fe_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -569,13 +596,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
ocfs2_commit_trans(osb, handle);
handle = NULL;
- status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
- if (status < 0) {
- mlog_errno(status);
- goto out;
- }
-
- status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+ status = ocfs2_commit_truncate(osb, inode, fe_bh);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -585,7 +606,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
out:
if (handle)
ocfs2_commit_trans(osb, handle);
- mlog_exit(status);
return status;
}
@@ -627,15 +647,17 @@ static int ocfs2_remove_inode(struct inode *inode,
goto bail_unlock;
}
- status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
- orphan_dir_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail_commit;
+ if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+ status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+ orphan_dir_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_commit;
+ }
}
/* set the inodes dtime */
- status = ocfs2_journal_access_di(handle, inode, di_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -644,15 +666,10 @@ static int ocfs2_remove_inode(struct inode *inode,
di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
+ ocfs2_journal_dirty(handle, di_bh);
- status = ocfs2_journal_dirty(handle, di_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail_commit;
- }
-
- ocfs2_remove_from_cache(inode, di_bh);
- vfs_dq_free_inode(inode);
+ ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
+ dquot_free_inode(inode);
status = ocfs2_free_dinode(handle, inode_alloc_inode,
inode_alloc_bh, di);
@@ -671,7 +688,7 @@ bail:
return status;
}
-/*
+/*
* Serialize with orphan dir recovery. If the process doing
* recovery on this orphan dir does an iget() with the dir
* i_mutex held, we'll deadlock here. Instead we detect this
@@ -684,8 +701,6 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
spin_lock(&osb->osb_lock);
if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) {
- mlog(0, "Recovery is happening on orphan dir %d, will skip "
- "this inode\n", slot);
ret = -EDEADLK;
goto out;
}
@@ -694,6 +709,7 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
osb->osb_orphan_wipes[slot]++;
out:
spin_unlock(&osb->osb_lock);
+ trace_ocfs2_check_orphan_recovery_state(slot, ret);
return ret;
}
@@ -710,38 +726,39 @@ static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
static int ocfs2_wipe_inode(struct inode *inode,
struct buffer_head *di_bh)
{
- int status, orphaned_slot;
+ int status, orphaned_slot = -1;
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_dinode *di;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
- di = (struct ocfs2_dinode *) di_bh->b_data;
- orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
+ if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+ orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
- status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
- if (status)
- return status;
+ status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
+ if (status)
+ return status;
- orphan_dir_inode = ocfs2_get_system_file_inode(osb,
- ORPHAN_DIR_SYSTEM_INODE,
- orphaned_slot);
- if (!orphan_dir_inode) {
- status = -EEXIST;
- mlog_errno(status);
- goto bail;
- }
+ orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+ ORPHAN_DIR_SYSTEM_INODE,
+ orphaned_slot);
+ if (!orphan_dir_inode) {
+ status = -EEXIST;
+ mlog_errno(status);
+ goto bail;
+ }
- /* Lock the orphan dir. The lock will be held for the entire
- * delete_inode operation. We do this now to avoid races with
- * recovery completion on other nodes. */
- mutex_lock(&orphan_dir_inode->i_mutex);
- status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
- if (status < 0) {
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ /* Lock the orphan dir. The lock will be held for the entire
+ * delete_inode operation. We do this now to avoid races with
+ * recovery completion on other nodes. */
+ mutex_lock(&orphan_dir_inode->i_mutex);
+ status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ if (status < 0) {
+ mutex_unlock(&orphan_dir_inode->i_mutex);
- mlog_errno(status);
- goto bail;
+ mlog_errno(status);
+ goto bail;
+ }
}
/* we do this while holding the orphan dir lock because we
@@ -770,12 +787,21 @@ static int ocfs2_wipe_inode(struct inode *inode,
goto bail_unlock_dir;
}
+ status = ocfs2_remove_refcount_tree(inode, di_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_unlock_dir;
+ }
+
status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
orphan_dir_bh);
if (status < 0)
mlog_errno(status);
bail_unlock_dir:
+ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)
+ return status;
+
ocfs2_inode_unlock(orphan_dir_inode, 1);
mutex_unlock(&orphan_dir_inode->i_mutex);
brelse(orphan_dir_bh);
@@ -794,6 +820,10 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task,
+ (unsigned long long)oi->ip_blkno,
+ oi->ip_flags);
+
/* We shouldn't be getting here for the root directory
* inode.. */
if (inode == osb->root_inode) {
@@ -801,16 +831,15 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail;
}
- /* If we're coming from downconvert_thread we can't go into our own
- * voting [hello, deadlock city!], so unforuntately we just
- * have to skip deleting this guy. That's OK though because
- * the node who's doing the actual deleting should handle it
- * anyway. */
- if (current == osb->dc_task) {
- mlog(0, "Skipping delete of %lu because we're currently "
- "in downconvert\n", inode->i_ino);
+ /*
+ * If we're coming from downconvert_thread we can't go into our own
+ * voting [hello, deadlock city!] so we cannot delete the inode. But
+ * since we dropped last inode ref when downconverting dentry lock,
+ * we cannot have the file open and thus the node doing unlink will
+ * take care of deleting the inode.
+ */
+ if (current == osb->dc_task)
goto bail;
- }
spin_lock(&oi->ip_lock);
/* OCFS2 *never* deletes system files. This should technically
@@ -822,15 +851,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail_unlock;
}
- /* If we have allowd wipe of this inode for another node, it
- * will be marked here so we can safely skip it. Recovery will
- * cleanup any inodes we might inadvertantly skip here. */
- if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
- mlog(0, "Skipping delete of %lu because another node "
- "has done this for us.\n", inode->i_ino);
- goto bail_unlock;
- }
-
ret = 1;
bail_unlock:
spin_unlock(&oi->ip_lock);
@@ -846,32 +866,45 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
struct buffer_head *di_bh,
int *wipe)
{
- int status = 0;
+ int status = 0, reason = 0;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_dinode *di;
*wipe = 0;
+ trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno,
+ inode->i_nlink);
+
/* While we were waiting for the cluster lock in
* ocfs2_delete_inode, another node might have asked to delete
* the inode. Recheck our flags to catch this. */
if (!ocfs2_inode_is_valid_to_delete(inode)) {
- mlog(0, "Skipping delete of %llu because flags changed\n",
- (unsigned long long)oi->ip_blkno);
+ reason = 1;
goto bail;
}
/* Now that we have an up to date inode, we can double check
* the link count. */
- if (inode->i_nlink) {
- mlog(0, "Skipping delete of %llu because nlink = %u\n",
- (unsigned long long)oi->ip_blkno, inode->i_nlink);
+ if (inode->i_nlink)
goto bail;
- }
/* Do some basic inode verification... */
di = (struct ocfs2_dinode *) di_bh->b_data;
- if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+ if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) &&
+ !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+ /*
+ * Inodes in the orphan dir must have ORPHANED_FL. The only
+ * inodes that come back out of the orphan dir are reflink
+ * targets. A reflink target may be moved out of the orphan
+ * dir between the time we scan the directory and the time we
+ * process it. This would lead to HAS_REFCOUNT_FL being set but
+ * ORPHANED_FL not.
+ */
+ if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
+ reason = 2;
+ goto bail;
+ }
+
/* for lack of a better error? */
status = -EEXIST;
mlog(ML_ERROR,
@@ -896,7 +929,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
* the inode open lock in ocfs2_read_locked_inode(). When we
* get to ->delete_inode(), each node tries to convert it's
* lock to an exclusive. Trylocks are serialized by the inode
- * meta data lock. If the upconvert suceeds, we know the inode
+ * meta data lock. If the upconvert succeeds, we know the inode
* is no longer live and can be deleted.
*
* Though we call this with the meta data lock held, the
@@ -905,8 +938,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
status = ocfs2_try_open_lock(inode, 1);
if (status == -EAGAIN) {
status = 0;
- mlog(0, "Skipping delete of %llu because it is in use on "
- "other nodes\n", (unsigned long long)oi->ip_blkno);
+ reason = 3;
goto bail;
}
if (status < 0) {
@@ -915,11 +947,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
}
*wipe = 1;
- mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
- (unsigned long long)oi->ip_blkno,
- le16_to_cpu(di->i_orphaned_slot));
+ trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot));
bail:
+ trace_ocfs2_query_inode_wipe_end(status, reason);
return status;
}
@@ -929,28 +960,28 @@ bail:
static void ocfs2_cleanup_delete_inode(struct inode *inode,
int sync_data)
{
- mlog(0, "Cleanup inode %llu, sync = %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
+ trace_ocfs2_cleanup_delete_inode(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
if (sync_data)
- write_inode_now(inode, 1);
- truncate_inode_pages(&inode->i_data, 0);
+ filemap_write_and_wait(inode->i_mapping);
+ truncate_inode_pages_final(&inode->i_data);
}
-void ocfs2_delete_inode(struct inode *inode)
+static void ocfs2_delete_inode(struct inode *inode)
{
int wipe, status;
- sigset_t blocked, oldset;
+ sigset_t oldset;
struct buffer_head *di_bh = NULL;
- mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+ trace_ocfs2_delete_inode(inode->i_ino,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ is_bad_inode(inode));
/* When we fail in read_inode() we mark inode as bad. The second test
* catches the case when inode allocation fails before allocating
* a block for inode. */
- if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
- mlog(0, "Skipping delete of bad inode\n");
+ if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
goto bail;
- }
if (!ocfs2_inode_is_valid_to_delete(inode)) {
/* It's probably not necessary to truncate_inode_pages
@@ -960,17 +991,13 @@ void ocfs2_delete_inode(struct inode *inode)
goto bail;
}
+ dquot_initialize(inode);
+
/* We want to block signals in delete_inode as the lock and
* messaging paths may return us -ERESTARTSYS. Which would
* cause us to exit early, resulting in inodes being orphaned
* forever. */
- sigfillset(&blocked);
- status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
- if (status < 0) {
- mlog_errno(status);
- ocfs2_cleanup_delete_inode(inode, 1);
- goto bail;
- }
+ ocfs2_block_signals(&oldset);
/*
* Synchronize us against ocfs2_get_dentry. We take this in
@@ -1044,39 +1071,39 @@ bail_unlock_nfs_sync:
ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
bail_unblock:
- status = sigprocmask(SIG_SETMASK, &oldset, NULL);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_unblock_signals(&oldset);
bail:
- clear_inode(inode);
- mlog_exit_void();
+ return;
}
-void ocfs2_clear_inode(struct inode *inode)
+static void ocfs2_clear_inode(struct inode *inode)
{
int status;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry_void();
-
- if (!inode)
- goto bail;
-
- mlog(0, "Clearing inode: %llu, nlink = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink);
+ clear_inode(inode);
+ trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
+ inode->i_nlink);
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino);
+ dquot_drop(inode);
+
/* To preven remote deletes we hold open lock before, now it
* is time to unlock PR and EX open locks. */
ocfs2_open_unlock(inode);
/* Do these before all the other work so that we don't bounce
* the downconvert thread while waiting to destroy the locks. */
- ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
- ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
- ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
+
+ ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
+ &oi->ip_la_data_resv);
+ ocfs2_resv_init_once(&oi->ip_la_data_resv);
/* We very well may get a clear_inode before all an inodes
* metadata has hit disk. Of course, we can't drop any cluster
@@ -1101,13 +1128,14 @@ void ocfs2_clear_inode(struct inode *inode)
ocfs2_lock_res_free(&oi->ip_inode_lockres);
ocfs2_lock_res_free(&oi->ip_open_lockres);
- ocfs2_metadata_cache_purge(inode);
+ ocfs2_metadata_cache_exit(INODE_CACHE(inode));
- mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
+ mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached,
"Clear inode of %llu, inode has %u cache items\n",
- (unsigned long long)oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
+ (unsigned long long)oi->ip_blkno,
+ INODE_CACHE(inode)->ci_num_cached);
- mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+ mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE),
"Clear inode of %llu, inode has a bad flag\n",
(unsigned long long)oi->ip_blkno);
@@ -1134,9 +1162,7 @@ void ocfs2_clear_inode(struct inode *inode)
(unsigned long long)oi->ip_blkno, oi->ip_open_count);
/* Clear all other flags. */
- oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
- oi->ip_created_trans = 0;
- oi->ip_last_trans = 0;
+ oi->ip_flags = 0;
oi->ip_dir_start_lookup = 0;
oi->ip_blkno = 0ULL;
@@ -1147,29 +1173,36 @@ void ocfs2_clear_inode(struct inode *inode)
*/
jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
&oi->ip_jinode);
+}
-bail:
- mlog_exit_void();
+void ocfs2_evict_inode(struct inode *inode)
+{
+ if (!inode->i_nlink ||
+ (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
+ ocfs2_delete_inode(inode);
+ } else {
+ truncate_inode_pages_final(&inode->i_data);
+ }
+ ocfs2_clear_inode(inode);
}
/* Called under inode_lock, with no more references on the
* struct inode, so it's safe here to check the flags field
* and to manipulate i_nlink without any other locks. */
-void ocfs2_drop_inode(struct inode *inode)
+int ocfs2_drop_inode(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ int res;
- mlog_entry_void();
-
- mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
- (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
+ trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
+ inode->i_nlink, oi->ip_flags);
if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
- generic_delete_inode(inode);
+ res = 1;
else
- generic_drop_inode(inode);
+ res = generic_drop_inode(inode);
- mlog_exit_void();
+ return res;
}
/*
@@ -1180,11 +1213,11 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int status = 0;
- mlog_entry("(inode = 0x%p, ino = %llu)\n", inode,
- inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL);
+ trace_ocfs2_inode_revalidate(inode,
+ inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL,
+ inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0);
if (!inode) {
- mlog(0, "eep, no inode!\n");
status = -ENOENT;
goto bail;
}
@@ -1192,7 +1225,6 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
spin_lock(&OCFS2_I(inode)->ip_lock);
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
- mlog(0, "inode deleted!\n");
status = -ENOENT;
goto bail;
}
@@ -1208,8 +1240,6 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
}
ocfs2_inode_unlock(inode, 0);
bail:
- mlog_exit(status);
-
return status;
}
@@ -1225,10 +1255,9 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
int status;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
- mlog_entry("(inode %llu)\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_journal_access_di(handle, inode, bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1244,8 +1273,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
fe->i_size = cpu_to_le64(i_size_read(inode));
ocfs2_set_links_count(fe, inode->i_nlink);
- fe->i_uid = cpu_to_le32(inode->i_uid);
- fe->i_gid = cpu_to_le32(inode->i_gid);
+ fe->i_uid = cpu_to_le32(i_uid_read(inode));
+ fe->i_gid = cpu_to_le32(i_gid_read(inode));
fe->i_mode = cpu_to_le16(inode->i_mode);
fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
@@ -1254,14 +1283,9 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0)
- mlog_errno(status);
-
- status = 0;
+ ocfs2_journal_dirty(handle, bh);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
leave:
-
- mlog_exit(status);
return status;
}
@@ -1280,9 +1304,9 @@ void ocfs2_refresh_inode(struct inode *inode,
OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
ocfs2_set_inode_flags(inode);
i_size_write(inode, le64_to_cpu(fe->i_size));
- inode->i_nlink = ocfs2_read_links_count(fe);
- inode->i_uid = le32_to_cpu(fe->i_uid);
- inode->i_gid = le32_to_cpu(fe->i_gid);
+ set_nlink(inode, ocfs2_read_links_count(fe));
+ i_uid_write(inode, le32_to_cpu(fe->i_uid));
+ i_gid_write(inode, le32_to_cpu(fe->i_gid));
inode->i_mode = le16_to_cpu(fe->i_mode);
if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
inode->i_blocks = 0;
@@ -1304,8 +1328,7 @@ int ocfs2_validate_inode_block(struct super_block *sb,
int rc;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
- mlog(0, "Validating dinode %llu\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
@@ -1369,8 +1392,8 @@ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
int rc;
struct buffer_head *tmp = *bh;
- rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
- flags, ocfs2_validate_inode_block);
+ rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno,
+ 1, &tmp, flags, ocfs2_validate_inode_block);
/* If ocfs2_read_blocks() got us a new bh, pass it up. */
if (!rc && !*bh)
@@ -1383,3 +1406,56 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
{
return ocfs2_read_inode_block_full(inode, bh, 0);
}
+
+
+static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ return oi->ip_blkno;
+}
+
+static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ return oi->vfs_inode.i_sb;
+}
+
+static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ spin_lock(&oi->ip_lock);
+}
+
+static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ spin_unlock(&oi->ip_lock);
+}
+
+static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ mutex_lock(&oi->ip_io_mutex);
+}
+
+static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+ struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+ mutex_unlock(&oi->ip_io_mutex);
+}
+
+const struct ocfs2_caching_operations ocfs2_inode_caching_ops = {
+ .co_owner = ocfs2_inode_cache_owner,
+ .co_get_super = ocfs2_inode_cache_get_super,
+ .co_cache_lock = ocfs2_inode_cache_lock,
+ .co_cache_unlock = ocfs2_inode_cache_unlock,
+ .co_io_lock = ocfs2_inode_cache_io_lock,
+ .co_io_unlock = ocfs2_inode_cache_io_unlock,
+};
+
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ea71525aad4..a6c991c0fc9 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,39 +43,43 @@ struct ocfs2_inode_info
/* protects extended attribute changes on this inode */
struct rw_semaphore ip_xattr_sem;
+ /* Number of outstanding AIO's which are not page aligned */
+ struct mutex ip_unaligned_aio;
+
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
- u32 ip_clusters;
struct list_head ip_io_markers;
+ u32 ip_clusters;
+ u16 ip_dyn_features;
struct mutex ip_io_mutex;
-
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
- u16 ip_dyn_features;
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
- u32 ip_dir_start_lookup;
-
- /* next two are protected by trans_inc_lock */
- /* which transaction were we created on? Zero if none. */
- unsigned long ip_created_trans;
- /* last transaction we were a part of. */
- unsigned long ip_last_trans;
-
struct ocfs2_caching_info ip_metadata_cache;
-
struct ocfs2_extent_map ip_extent_map;
-
struct inode vfs_inode;
struct jbd2_inode ip_jinode;
+ u32 ip_dir_start_lookup;
+
/* Only valid if the inode is the dir. */
u32 ip_last_used_slot;
u64 ip_last_used_group;
+ u32 ip_dir_lock_gen;
+
+ struct ocfs2_alloc_reservation ip_la_data_resv;
+
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ tid_t i_sync_tid;
+ tid_t i_datasync_tid;
};
/*
@@ -87,8 +91,6 @@ struct ocfs2_inode_info
#define OCFS2_INODE_BITMAP 0x00000004
/* This inode has been wiped from disk */
#define OCFS2_INODE_DELETED 0x00000008
-/* Another node is deleting, so our delete is a nop */
-#define OCFS2_INODE_SKIP_DELETE 0x00000010
/* Has the inode been orphaned on another node?
*
* This hints to ocfs2_drop_inode that it should clear i_nlink before
@@ -103,11 +105,11 @@ struct ocfs2_inode_info
* rely on ocfs2_delete_inode to sort things out under the proper
* cluster locks.
*/
-#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
+#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010
/* Does someone have the file open O_DIRECT */
-#define OCFS2_INODE_OPEN_DIRECT 0x00000040
-/* Indicates that the metadata cache should be used as an array. */
-#define OCFS2_INODE_CACHE_INLINE 0x00000080
+#define OCFS2_INODE_OPEN_DIRECT 0x00000020
+/* Tell the inode wipe code it's not in orphan dir */
+#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
{
@@ -120,10 +122,15 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
extern struct kmem_cache *ocfs2_inode_cache;
extern const struct address_space_operations ocfs2_aops;
+extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops;
-void ocfs2_clear_inode(struct inode *inode);
-void ocfs2_delete_inode(struct inode *inode);
-void ocfs2_drop_inode(struct inode *inode);
+/* Return the metadata caching info embedded in @inode's ocfs2 inode info. */
+static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	return &oi->ip_metadata_cache;
+}
+
+void ocfs2_evict_inode(struct inode *inode);
+int ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x1
@@ -145,8 +152,6 @@ void ocfs2_refresh_inode(struct inode *inode,
int ocfs2_mark_inode_dirty(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
-int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
-int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
struct buffer_head *ocfs2_bread(struct inode *inode,
int block, int *err, int reada);
@@ -172,4 +177,10 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
/* The same, but can be passed OCFS2_BH_* flags */
int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
int flags);
+
+/*
+ * Map a caching info back to the ocfs2_inode_info that embeds it as
+ * ip_metadata_cache.  Only meaningful for caching infos that really are
+ * that member of an ocfs2_inode_info.
+ */
+static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci)
+{
+ return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache);
+}
+
#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 9fcd36dcc9a..6f66b3751ac 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,9 +7,9 @@
#include <linux/fs.h>
#include <linux/mount.h>
-#include <linux/smp_lock.h>
+#include <linux/blkdev.h>
+#include <linux/compat.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -22,8 +22,44 @@
#include "ocfs2_fs.h"
#include "ioctl.h"
#include "resize.h"
+#include "refcounttree.h"
+#include "sysfile.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "move_extents.h"
-#include <linux/ext2_fs.h>
+/*
+ * Copy a whole request structure @a from / to the user pointer @b.
+ * Both evaluate to the copy_{from,to}_user() result: 0 on success,
+ * non-zero if some bytes could not be copied.  @b is parenthesized in
+ * both macros so that an expression argument (e.g. "bp + idx") is cast
+ * and passed as a unit.
+ */
+#define o2info_from_user(a, b)	\
+		copy_from_user(&(a), (b), sizeof(a))
+#define o2info_to_user(a, b)	\
+		copy_to_user((typeof(a) __user *)(b), &(a), sizeof(a))
+
+/*
+ * Best-effort notification: mark the kernel copy of the request as failed
+ * and try to mirror the flags word into the userspace request.  Nothing is
+ * returned because the caller is already propagating an error (possibly
+ * -EFAULT) out of ioctl(2); a failure of the put_user() is deliberately
+ * ignored.
+ */
+static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
+					    struct ocfs2_info_request __user *req)
+{
+	__u32 __user *uflags = (__u32 __user *)&(req->ir_flags);
+
+	kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
+	(void)put_user(kreq->ir_flags, uflags);
+}
+
+/* Flag @req as having been filled in. */
+static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
+{
+	req->ir_flags = req->ir_flags | OCFS2_INFO_FL_FILLED;
+}
+
+/* Drop the "filled" flag from @req, leaving every other flag untouched. */
+static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
+{
+	req->ir_flags = req->ir_flags & ~OCFS2_INFO_FL_FILLED;
+}
+
+/* Return 1 unless the request carries OCFS2_INFO_FL_NON_COHERENT. */
+static inline int o2info_coherent(struct ocfs2_info_request *req)
+{
+	if (req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)
+		return 0;
+
+	return 1;
+}
static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
{
@@ -38,7 +74,6 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
*flags = OCFS2_I(inode)->ip_attr;
ocfs2_inode_unlock(inode, 0);
- mlog_exit(status);
return status;
}
@@ -61,19 +96,12 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
}
status = -EACCES;
- if (!is_owner_or_cap(inode))
+ if (!inode_owner_or_capable(inode))
goto bail_unlock;
if (!S_ISDIR(inode->i_mode))
flags &= ~OCFS2_DIRSYNC_FL;
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- mlog_errno(status);
- goto bail_unlock;
- }
-
oldflags = ocfs2_inode->ip_attr;
flags = flags & mask;
flags |= oldflags & ~mask;
@@ -89,6 +117,13 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
goto bail_unlock;
}
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto bail_unlock;
+ }
+
ocfs2_inode->ip_attr = flags;
ocfs2_set_inode_flags(inode);
@@ -97,6 +132,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
mlog_errno(status);
ocfs2_commit_trans(osb, handle);
+
bail_unlock:
ocfs2_inode_unlock(inode, 1);
bail:
@@ -104,18 +140,764 @@ bail:
brelse(bh);
- mlog_exit(status);
+ return status;
+}
+
+/*
+ * OCFS2_INFO_BLOCKSIZE: report the super block's s_blocksize.
+ *
+ * The request is copied in from @req, ib_blocksize is filled, the request
+ * is marked filled and copied back.  Returns 0 on success or -EFAULT if
+ * either user copy fails; on failure the error flag is pushed to
+ * userspace on a best-effort basis.
+ */
+static int ocfs2_info_handle_blocksize(struct inode *inode,
+				       struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_blocksize oib;
+
+	/*
+	 * Zero the local copy first: if the copy-in faults, the bail path
+	 * still reads oib.ib_req.ir_flags and must not hand uninitialized
+	 * stack contents to userspace.
+	 */
+	memset(&oib, 0, sizeof(oib));
+
+	if (o2info_from_user(oib, req))
+		goto bail;
+
+	oib.ib_blocksize = inode->i_sb->s_blocksize;
+
+	o2info_set_request_filled(&oib.ib_req);
+
+	if (o2info_to_user(oib, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(&oib.ib_req, req);
+
+	return status;
+}
+
+/*
+ * OCFS2_INFO_CLUSTERSIZE: report osb->s_clustersize.
+ *
+ * Returns 0 on success or -EFAULT if either user copy fails; on failure
+ * the error flag is pushed to userspace on a best-effort basis.
+ */
+static int ocfs2_info_handle_clustersize(struct inode *inode,
+					 struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_clustersize oic;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/*
+	 * Zero the local copy first: if the copy-in faults, the bail path
+	 * still reads oic.ic_req.ir_flags and must not hand uninitialized
+	 * stack contents to userspace.
+	 */
+	memset(&oic, 0, sizeof(oic));
+
+	if (o2info_from_user(oic, req))
+		goto bail;
+
+	oic.ic_clustersize = osb->s_clustersize;
+
+	o2info_set_request_filled(&oic.ic_req);
+
+	if (o2info_to_user(oic, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(&oic.ic_req, req);
+
+	return status;
+}
+
+/*
+ * OCFS2_INFO_MAXSLOTS: report osb->max_slots.
+ *
+ * Returns 0 on success or -EFAULT if either user copy fails; on failure
+ * the error flag is pushed to userspace on a best-effort basis.
+ */
+static int ocfs2_info_handle_maxslots(struct inode *inode,
+				      struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_maxslots oim;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/*
+	 * Zero the local copy first: if the copy-in faults, the bail path
+	 * still reads oim.im_req.ir_flags and must not hand uninitialized
+	 * stack contents to userspace.
+	 */
+	memset(&oim, 0, sizeof(oim));
+
+	if (o2info_from_user(oim, req))
+		goto bail;
+
+	oim.im_max_slots = osb->max_slots;
+
+	o2info_set_request_filled(&oim.im_req);
+
+	if (o2info_to_user(oim, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(&oim.im_req, req);
+
+	return status;
+}
+
+/*
+ * OCFS2_INFO_LABEL: copy OCFS2_MAX_VOL_LABEL_LEN bytes of osb->vol_label
+ * into the request.
+ *
+ * Returns 0 on success or -EFAULT if either user copy fails; on failure
+ * the error flag is pushed to userspace on a best-effort basis.
+ */
+static int ocfs2_info_handle_label(struct inode *inode,
+				   struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_label oil;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/*
+	 * Zero the local copy first: if the copy-in faults, the bail path
+	 * still reads oil.il_req.ir_flags and must not hand uninitialized
+	 * stack contents to userspace.
+	 */
+	memset(&oil, 0, sizeof(oil));
+
+	if (o2info_from_user(oil, req))
+		goto bail;
+
+	memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
+
+	o2info_set_request_filled(&oil.il_req);
+
+	if (o2info_to_user(oil, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(&oil.il_req, req);
+
+	return status;
+}
+
+/*
+ * OCFS2_INFO_UUID: copy OCFS2_TEXT_UUID_LEN + 1 bytes of osb->uuid_str
+ * into the request.
+ *
+ * Returns 0 on success or -EFAULT if either user copy fails; on failure
+ * the error flag is pushed to userspace on a best-effort basis.
+ */
+static int ocfs2_info_handle_uuid(struct inode *inode,
+				  struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_uuid oiu;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/*
+	 * Zero the local copy first: if the copy-in faults, the bail path
+	 * still reads oiu.iu_req.ir_flags and must not hand uninitialized
+	 * stack contents to userspace.
+	 */
+	memset(&oiu, 0, sizeof(oiu));
+
+	if (o2info_from_user(oiu, req))
+		goto bail;
+
+	memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
+
+	o2info_set_request_filled(&oiu.iu_req);
+
+	if (o2info_to_user(oiu, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(&oiu.iu_req, req);
+
+	return status;
+}
+
+/*
+ * OCFS2_INFO_FS_FEATURES: report the compat, incompat and ro_compat
+ * feature words stored in the ocfs2 super.
+ *
+ * Returns 0 on success or -EFAULT if either user copy fails; on failure
+ * the error flag is pushed to userspace on a best-effort basis.
+ */
+static int ocfs2_info_handle_fs_features(struct inode *inode,
+					 struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_fs_features oif;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/*
+	 * Zero the local copy first: if the copy-in faults, the bail path
+	 * still reads oif.if_req.ir_flags and must not hand uninitialized
+	 * stack contents to userspace.
+	 */
+	memset(&oif, 0, sizeof(oif));
+
+	if (o2info_from_user(oif, req))
+		goto bail;
+
+	oif.if_compat_features = osb->s_feature_compat;
+	oif.if_incompat_features = osb->s_feature_incompat;
+	oif.if_ro_compat_features = osb->s_feature_ro_compat;
+
+	o2info_set_request_filled(&oif.if_req);
+
+	if (o2info_to_user(oif, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(&oif.if_req, req);
+
+	return status;
+}
+
+/*
+ * OCFS2_INFO_JOURNAL_SIZE: report i_size of the journal inode
+ * (osb->journal->j_inode).
+ *
+ * Returns 0 on success or -EFAULT if either user copy fails; on failure
+ * the error flag is pushed to userspace on a best-effort basis.
+ */
+static int ocfs2_info_handle_journal_size(struct inode *inode,
+					  struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_journal_size oij;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	/*
+	 * Zero the local copy first: if the copy-in faults, the bail path
+	 * still reads oij.ij_req.ir_flags and must not hand uninitialized
+	 * stack contents to userspace.
+	 */
+	memset(&oij, 0, sizeof(oij));
+
+	if (o2info_from_user(oij, req))
+		goto bail;
+
+	oij.ij_journal_size = i_size_read(osb->journal->j_inode);
+
+	o2info_set_request_filled(&oij.ij_req);
+
+	if (o2info_to_user(oij, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(&oij.ij_req, req);
+
+	return status;
+}
+
+/*
+ * Record the total and free counts of one slot's inode allocator in
+ * fi->ifi_stat[slot].
+ *
+ * For a coherent request the allocator dinode is obtained through
+ * ocfs2_inode_lock() on @inode_alloc; otherwise the block at @blkno is
+ * read with ocfs2_read_blocks_sync().  Returns 0 on success or the
+ * negative status of whichever read path failed.
+ */
+static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
+ struct inode *inode_alloc, u64 blkno,
+ struct ocfs2_info_freeinode *fi,
+ u32 slot)
+{
+ int status = 0, unlock = 0;
+
+ struct buffer_head *bh = NULL;
+ struct ocfs2_dinode *dinode_alloc = NULL;
+
+ /* inode_alloc may be NULL; the caller only looks it up for coherent requests */
+ if (inode_alloc)
+ mutex_lock(&inode_alloc->i_mutex);
+
+ if (o2info_coherent(&fi->ifi_req)) {
+ status = ocfs2_inode_lock(inode_alloc, &bh, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ /* remember to drop the inode lock on the way out */
+ unlock = 1;
+ } else {
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
+
+ /* free = i_total - i_used, both converted from little-endian */
+ fi->ifi_stat[slot].lfi_total =
+ le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
+ fi->ifi_stat[slot].lfi_free =
+ le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
+ le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
+
+bail:
+ if (unlock)
+ ocfs2_inode_unlock(inode_alloc, 0);
+
+ if (inode_alloc)
+ mutex_unlock(&inode_alloc->i_mutex);
+
+ brelse(bh);
+
+ return status;
+}
+
+/*
+ * OCFS2_INFO_FREEINODE: report total/free inode counts for every slot.
+ *
+ * For each of osb->max_slots slots the inode allocator is located (via
+ * ocfs2_get_system_file_inode() for coherent requests, by name lookup in
+ * the system directory otherwise) and scanned with
+ * ocfs2_info_scan_inode_alloc().
+ *
+ * Returns 0 on success, -ENOMEM if the request buffer cannot be
+ * allocated, -EFAULT if a user copy fails, -EIO/-ENOENT if an allocator
+ * cannot be found, or the scan's error.  On any failure after the
+ * allocation the error flag is pushed to userspace on a best-effort
+ * basis.
+ */
+static int ocfs2_info_handle_freeinode(struct inode *inode,
+				       struct ocfs2_info_request __user *req)
+{
+	u32 i;
+	u64 blkno = -1;
+	char namebuf[40];
+	int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
+	struct ocfs2_info_freeinode *oifi = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *inode_alloc = NULL;
+
+	oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
+	if (!oifi) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	if (o2info_from_user(*oifi, req))
+		goto bail;
+
+	oifi->ifi_slotnum = osb->max_slots;
+
+	for (i = 0; i < oifi->ifi_slotnum; i++) {
+		if (o2info_coherent(&oifi->ifi_req)) {
+			inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
+			if (!inode_alloc) {
+				mlog(ML_ERROR, "unable to get alloc inode in "
+				    "slot %u\n", i);
+				status = -EIO;
+				goto bail;
+			}
+		} else {
+			ocfs2_sprintf_system_inode_name(namebuf,
+							sizeof(namebuf),
+							type, i);
+			status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+							    namebuf,
+							    strlen(namebuf),
+							    &blkno);
+			if (status < 0) {
+				status = -ENOENT;
+				goto bail;
+			}
+		}
+
+		status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
+
+		/* inode_alloc is NULL in the non-coherent case; iput() accepts that */
+		iput(inode_alloc);
+		inode_alloc = NULL;
+
+		if (status < 0)
+			goto bail;
+	}
+
+	o2info_set_request_filled(&oifi->ifi_req);
+
+	if (o2info_to_user(*oifi, req)) {
+		/*
+		 * The slot loop leaves status at 0, so the fault has to be
+		 * recorded explicitly or it would be reported as success.
+		 */
+		status = -EFAULT;
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(&oifi->ifi_req, req);
+
+	kfree(oifi);
+out_err:
+	return status;
+}
+
+/*
+ * Account one free extent of @chunksize clusters in the histogram.  The
+ * bucket is __ilog2_u32(chunksize), clamped to the last bucket
+ * (OCFS2_INFO_MAX_HIST - 1).
+ */
+static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
+				   unsigned int chunksize)
+{
+	int bucket = __ilog2_u32(chunksize);
+
+	if (bucket >= OCFS2_INFO_MAX_HIST)
+		bucket = OCFS2_INFO_MAX_HIST - 1;
+
+	hist->fc_clusters[bucket] += chunksize;
+	hist->fc_chunks[bucket]++;
+}
+
+/*
+ * Fold one free extent of @chunksize clusters into the running stats:
+ * ffs_avg accumulates the sum of sizes, ffs_free_chunks_real counts the
+ * extents, and ffs_min / ffs_max track the extremes.
+ */
+static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
+			       unsigned int chunksize)
+{
+	stats->ffs_free_chunks_real++;
+	stats->ffs_avg += chunksize;
+
+	if (stats->ffs_min > chunksize)
+		stats->ffs_min = chunksize;
+
+	if (stats->ffs_max < chunksize)
+		stats->ffs_max = chunksize;
+}
+
+/* Account one free extent in both the histogram and the summary stats. */
+static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
+				  unsigned int chunksize)
+{
+	struct ocfs2_info_freefrag_stats *stats = &(ffg->iff_ffs);
+
+	o2ffg_update_histogram(&(stats->ffs_fc_hist), chunksize);
+	o2ffg_update_stats(stats, chunksize);
+}
+
+/*
+ * Walk every group descriptor of one chain record and accumulate
+ * free-fragment statistics into @ffg.
+ *
+ * The walk starts at rec->c_blkno and follows bg_next_group until it is
+ * zero.  A chain whose c_free is zero is skipped entirely.  Each
+ * descriptor is read through ocfs2_read_group_descriptor() for coherent
+ * requests and through ocfs2_read_blocks_sync() otherwise.
+ *
+ * Returns 0 on success or -EIO if a group descriptor cannot be read.
+ */
+static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
+ struct inode *gb_inode,
+ struct ocfs2_dinode *gb_dinode,
+ struct ocfs2_chain_rec *rec,
+ struct ocfs2_info_freefrag *ffg,
+ u32 chunks_in_group)
+{
+ int status = 0, used;
+ u64 blkno;
+
+ struct buffer_head *bh = NULL;
+ struct ocfs2_group_desc *bg = NULL;
+
+ unsigned int max_bits, num_clusters;
+ unsigned int offset = 0, cluster, chunk;
+ unsigned int chunk_free, last_chunksize = 0;
+
+ if (!le32_to_cpu(rec->c_free))
+ goto bail;
+
+ do {
+ /* first pass: head of the chain; afterwards: next group in the list */
+ if (!bg)
+ blkno = le64_to_cpu(rec->c_blkno);
+ else
+ blkno = le64_to_cpu(bg->bg_next_group);
+
+ /* bg pointed into the previous bh, which is released only after blkno was read */
+ if (bh) {
+ brelse(bh);
+ bh = NULL;
+ }
+
+ if (o2info_coherent(&ffg->iff_req))
+ status = ocfs2_read_group_descriptor(gb_inode,
+ gb_dinode,
+ blkno, &bh);
+ else
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+
+ if (status < 0) {
+ mlog(ML_ERROR, "Can't read the group descriptor # "
+ "%llu from device.", (unsigned long long)blkno);
+ status = -EIO;
+ goto bail;
+ }
+
+ bg = (struct ocfs2_group_desc *)bh->b_data;
+
+ /* "continue" in a do-while jumps to the bg_next_group test below */
+ if (!le16_to_cpu(bg->bg_free_bits_count))
+ continue;
+
+ max_bits = le16_to_cpu(bg->bg_bits);
+ offset = 0;
+
+ for (chunk = 0; chunk < chunks_in_group; chunk++) {
+ /*
+ * last chunk may be not an entire one.
+ */
+ if ((offset + ffg->iff_chunksize) > max_bits)
+ num_clusters = max_bits - offset;
+ else
+ num_clusters = ffg->iff_chunksize;
+
+ chunk_free = 0;
+ for (cluster = 0; cluster < num_clusters; cluster++) {
+ used = ocfs2_test_bit(offset,
+ (unsigned long *)bg->bg_bitmap);
+ /*
+ * - chunk_free counts free clusters in #N chunk.
+ * - last_chunksize records the size(in) clusters
+ * for the last real free chunk being counted.
+ */
+ if (!used) {
+ last_chunksize++;
+ chunk_free++;
+ }
+
+ /* a used bit ends the current run of free clusters */
+ if (used && last_chunksize) {
+ ocfs2_info_update_ffg(ffg,
+ last_chunksize);
+ last_chunksize = 0;
+ }
+
+ offset++;
+ }
+
+ /* only completely free, full-sized chunks count here */
+ if (chunk_free == ffg->iff_chunksize)
+ ffg->iff_ffs.ffs_free_chunks++;
+ }
+
+ /*
+ * need to update the info for last free chunk.
+ *
+ * NOTE(review): last_chunksize is not reset after this update,
+ * so a trailing free run carries over into the next group of
+ * the chain and is accounted again there - confirm whether
+ * that is intended.
+ */
+ if (last_chunksize)
+ ocfs2_info_update_ffg(ffg, last_chunksize);
+
+ } while (le64_to_cpu(bg->bg_next_group));
+
+bail:
+ brelse(bh);
+
+ return status;
+}
+
+/*
+ * Compute free-fragment statistics for the bitmap inode into
+ * ffg->iff_ffs.
+ *
+ * For a coherent request the bitmap dinode is obtained through
+ * ocfs2_inode_lock() on @gb_inode; otherwise the block at @blkno is read
+ * with ocfs2_read_blocks_sync().  Every chain record in the dinode's
+ * chain list is then scanned.
+ *
+ * When @gb_inode is non-NULL this function drops a reference on it with
+ * iput() on every path, success or failure.
+ *
+ * Returns 0 on success, -EINVAL if iff_chunksize exceeds the chain
+ * list's cl_cpg, or the error from reading the dinode or scanning a
+ * chain.
+ */
+static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
+ struct inode *gb_inode, u64 blkno,
+ struct ocfs2_info_freefrag *ffg)
+{
+ u32 chunks_in_group;
+ int status = 0, unlock = 0, i;
+
+ struct buffer_head *bh = NULL;
+ struct ocfs2_chain_list *cl = NULL;
+ struct ocfs2_chain_rec *rec = NULL;
+ struct ocfs2_dinode *gb_dinode = NULL;
+
+ /* gb_inode may be NULL; the caller only looks it up for coherent requests */
+ if (gb_inode)
+ mutex_lock(&gb_inode->i_mutex);
+
+ if (o2info_coherent(&ffg->iff_req)) {
+ status = ocfs2_inode_lock(gb_inode, &bh, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ unlock = 1;
+ } else {
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ gb_dinode = (struct ocfs2_dinode *)bh->b_data;
+ cl = &(gb_dinode->id2.i_chain);
+
+ /*
+ * Chunksize(in) clusters from userspace should be
+ * less than clusters in a group.
+ */
+ if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
+
+ /* start ffs_min at the maximum so the first extent always lowers it */
+ ffg->iff_ffs.ffs_min = ~0U;
+ ffg->iff_ffs.ffs_clusters =
+ le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
+ ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
+ le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
+
+ /* the "+ 1" leaves room for a trailing partial chunk */
+ chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
+
+ for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+ rec = &(cl->cl_recs[i]);
+ status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
+ gb_dinode,
+ rec, ffg,
+ chunks_in_group);
+ if (status)
+ goto bail;
+ }
+
+ /* ffs_avg held the sum of extent sizes up to here; turn it into a mean */
+ if (ffg->iff_ffs.ffs_free_chunks_real)
+ ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
+ ffg->iff_ffs.ffs_free_chunks_real);
+bail:
+ if (unlock)
+ ocfs2_inode_unlock(gb_inode, 0);
+
+ if (gb_inode)
+ mutex_unlock(&gb_inode->i_mutex);
+
+ if (gb_inode)
+ iput(gb_inode);
+
+ brelse(bh);
+
+ return status;
+}
+
+/*
+ * OCFS2_INFO_FREEFRAG: report free-space fragmentation of the global
+ * bitmap.
+ *
+ * iff_chunksize from userspace must be a non-zero power of two, else
+ * -EINVAL.  The global bitmap is located via
+ * ocfs2_get_system_file_inode() for coherent requests and by name lookup
+ * in the system directory otherwise, then scanned with
+ * ocfs2_info_freefrag_scan_bitmap().
+ *
+ * Returns 0 on success, -ENOMEM, -EFAULT, -EINVAL, -EIO, -ENOENT or the
+ * scan's error.  On any failure after the allocation the error flag is
+ * pushed to userspace on a best-effort basis.
+ */
+static int ocfs2_info_handle_freefrag(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ u64 blkno = -1;
+ char namebuf[40];
+ int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
+
+ struct ocfs2_info_freefrag *oiff;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *gb_inode = NULL;
+
+ oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
+ if (!oiff) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_err;
+ }
+
+ if (o2info_from_user(*oiff, req))
+ goto bail;
+ /*
+ * chunksize from userspace should be power of 2.
+ */
+ if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
+ (!oiff->iff_chunksize)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ if (o2info_coherent(&oiff->iff_req)) {
+ gb_inode = ocfs2_get_system_file_inode(osb, type,
+ OCFS2_INVALID_SLOT);
+ if (!gb_inode) {
+ mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+ status = -EIO;
+ goto bail;
+ }
+ } else {
+ ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
+ OCFS2_INVALID_SLOT);
+ status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+ namebuf,
+ strlen(namebuf),
+ &blkno);
+ if (status < 0) {
+ status = -ENOENT;
+ goto bail;
+ }
+ }
+
+ /* the scan drops the gb_inode reference itself, so there is no iput() here */
+ status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
+ if (status < 0)
+ goto bail;
+
+ o2info_set_request_filled(&oiff->iff_req);
+
+ /* status is 0 at this point, so the fault must be recorded explicitly */
+ if (o2info_to_user(*oiff, req)) {
+ status = -EFAULT;
+ goto bail;
+ }
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oiff->iff_req, req);
+
+ kfree(oiff);
+out_err:
+ return status;
+}
+
+/*
+ * Handle a request whose ir_code is not recognised: clear the "filled"
+ * flag on the request header and copy it back, so userspace can tell the
+ * request was not answered.
+ *
+ * Returns 0 on success or -EFAULT if either user copy fails; on failure
+ * the error flag is pushed to userspace on a best-effort basis.
+ */
+static int ocfs2_info_handle_unknown(struct inode *inode,
+				     struct ocfs2_info_request __user *req)
+{
+	int status = -EFAULT;
+	struct ocfs2_info_request oir;
+
+	/*
+	 * Zero the local copy first: if the copy-in faults, the bail path
+	 * still reads oir.ir_flags and must not hand uninitialized stack
+	 * contents to userspace.
+	 */
+	memset(&oir, 0, sizeof(oir));
+
+	if (o2info_from_user(oir, req))
+		goto bail;
+
+	o2info_clear_request_filled(&oir);
+
+	if (o2info_to_user(oir, req))
+		goto bail;
+
+	status = 0;
+bail:
+	if (status)
+		o2info_set_request_error(&oir, req);
+
+	return status;
+}
+
+/*
+ * Validate and distinguish OCFS2_IOC_INFO requests.
+ *
+ * - validate the magic number.
+ * - distinguish different requests.
+ * - validate size of different requests.
+ *
+ * Returns -EFAULT if the request header cannot be read, -EINVAL if the
+ * magic is wrong or a known code carries an ir_size that does not match
+ * its structure, otherwise the status of the specific handler.  Codes
+ * that are not recognised go to ocfs2_info_handle_unknown().
+ */
+static int ocfs2_info_handle_request(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_request oir;
+
+ if (o2info_from_user(oir, req))
+ goto bail;
+
+ /* stays -EINVAL for any known code whose size check below fails */
+ status = -EINVAL;
+ if (oir.ir_magic != OCFS2_INFO_MAGIC)
+ goto bail;
+
+ switch (oir.ir_code) {
+ case OCFS2_INFO_BLOCKSIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
+ status = ocfs2_info_handle_blocksize(inode, req);
+ break;
+ case OCFS2_INFO_CLUSTERSIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
+ status = ocfs2_info_handle_clustersize(inode, req);
+ break;
+ case OCFS2_INFO_MAXSLOTS:
+ if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
+ status = ocfs2_info_handle_maxslots(inode, req);
+ break;
+ case OCFS2_INFO_LABEL:
+ if (oir.ir_size == sizeof(struct ocfs2_info_label))
+ status = ocfs2_info_handle_label(inode, req);
+ break;
+ case OCFS2_INFO_UUID:
+ if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
+ status = ocfs2_info_handle_uuid(inode, req);
+ break;
+ case OCFS2_INFO_FS_FEATURES:
+ if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
+ status = ocfs2_info_handle_fs_features(inode, req);
+ break;
+ case OCFS2_INFO_JOURNAL_SIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
+ status = ocfs2_info_handle_journal_size(inode, req);
+ break;
+ case OCFS2_INFO_FREEINODE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
+ status = ocfs2_info_handle_freeinode(inode, req);
+ break;
+ case OCFS2_INFO_FREEFRAG:
+ if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
+ status = ocfs2_info_handle_freefrag(inode, req);
+ break;
+ default:
+ /* no size check for unknown codes; only the header is touched */
+ status = ocfs2_info_handle_unknown(inode, req);
+ break;
+ }
+
+bail:
+ return status;
+}
+
+/*
+ * Read entry @idx of the userspace array of request addresses referenced
+ * by info->oi_requests into *req_addr.  With @compat_flag set the base
+ * address goes through compat_ptr(); that path BUG()s on kernels built
+ * without CONFIG_COMPAT.
+ *
+ * Returns 0 on success or -EFAULT if the entry cannot be read.
+ */
+static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
+				 u64 *req_addr, int compat_flag)
+{
+	u64 __user *slots = NULL;
+
+	if (!compat_flag) {
+		slots = (u64 __user *)(unsigned long)(info->oi_requests);
+	} else {
+#ifdef CONFIG_COMPAT
+		/*
+		 * slots is the base of an array of pointers, one per
+		 * separate request.
+		 */
+		slots = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
+#else
+		BUG();
+#endif
+	}
+
+	if (o2info_from_user(*req_addr, slots + idx))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * OCFS2_IOC_INFO handles an array of requests passed from userspace.
+ *
+ * ocfs2_info_handle() receives a large info aggregation, grabs and
+ * validates the request count from header, then breaks it into small
+ * pieces, later specific handlers can handle them one by one.
+ *
+ * Idea here is to make each separate request small enough to ensure
+ * a better backward&forward compatibility, since a small piece of
+ * request will be less likely to be broken if disk layout get changed.
+ */
+static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
+ int compat_flag)
+{
+ int i, status = 0;
+ u64 req_addr;
+ struct ocfs2_info_request __user *reqp;
+
+ /* reject over-long request arrays and a zero array address up front */
+ if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
+ (!info->oi_requests)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ /* requests are handled in order; the first failure ends the loop */
+ for (i = 0; i < info->oi_count; i++) {
+
+ status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
+ if (status)
+ break;
+
+ reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr;
+ if (!reqp) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ status = ocfs2_info_handle_request(inode, reqp);
+ if (status)
+ break;
+ }
+
+bail:
return status;
}
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
unsigned int flags;
int new_clusters;
int status;
struct ocfs2_space_resv sr;
struct ocfs2_new_group_input input;
+ struct reflink_arguments args;
+ const char __user *old_path;
+ const char __user *new_path;
+ bool preserve;
+ struct ocfs2_info info;
+ void __user *argp = (void __user *)arg;
switch (cmd) {
case OCFS2_IOC_GETFLAGS:
@@ -129,12 +911,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- status = mnt_want_write(filp->f_path.mnt);
+ status = mnt_want_write_file(filp);
if (status)
return status;
status = ocfs2_set_inode_attr(inode, flags,
OCFS2_FL_MODIFIABLE);
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return status;
case OCFS2_IOC_RESVSP:
case OCFS2_IOC_RESVSP64:
@@ -151,7 +933,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (get_user(new_clusters, (int __user *)arg))
return -EFAULT;
- return ocfs2_group_extend(inode, new_clusters);
+ status = mnt_want_write_file(filp);
+ if (status)
+ return status;
+ status = ocfs2_group_extend(inode, new_clusters);
+ mnt_drop_write_file(filp);
+ return status;
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
if (!capable(CAP_SYS_RESOURCE))
@@ -160,7 +947,54 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
return -EFAULT;
- return ocfs2_group_add(inode, &input);
+ status = mnt_want_write_file(filp);
+ if (status)
+ return status;
+ status = ocfs2_group_add(inode, &input);
+ mnt_drop_write_file(filp);
+ return status;
+ case OCFS2_IOC_REFLINK:
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ old_path = (const char __user *)(unsigned long)args.old_path;
+ new_path = (const char __user *)(unsigned long)args.new_path;
+ preserve = (args.preserve != 0);
+
+ return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+ case OCFS2_IOC_INFO:
+ if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
+ return -EFAULT;
+
+ return ocfs2_info_handle(inode, &info, 0);
+ case FITRIM:
+ {
+ struct super_block *sb = inode->i_sb;
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ struct fstrim_range range;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&range, argp, sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max_t(u64, q->limits.discard_granularity,
+ range.minlen);
+ ret = ocfs2_trim_fs(sb, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(argp, &range, sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+ }
+ case OCFS2_IOC_MOVE_EXT:
+ return ocfs2_ioctl_move_extents(filp, argp);
default:
return -ENOTTY;
}
@@ -169,6 +1003,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
#ifdef CONFIG_COMPAT
long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
+ bool preserve;
+ struct reflink_arguments args;
+ struct inode *inode = file_inode(file);
+ struct ocfs2_info info;
+ void __user *argp = (void __user *)arg;
+
switch (cmd) {
case OCFS2_IOC32_GETFLAGS:
cmd = OCFS2_IOC_GETFLAGS;
@@ -183,6 +1023,21 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case OCFS2_IOC_GROUP_EXTEND:
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
+ case FITRIM:
+ break;
+ case OCFS2_IOC_REFLINK:
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ preserve = (args.preserve != 0);
+
+ return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
+ compat_ptr(args.new_path), preserve);
+ case OCFS2_IOC_INFO:
+ if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
+ return -EFAULT;
+
+ return ocfs2_info_handle(inode, &info, 1);
+ case OCFS2_IOC_MOVE_EXT:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index cf9a5ee30fe..0cd5323bd3f 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -7,10 +7,10 @@
*
*/
-#ifndef OCFS2_IOCTL_H
-#define OCFS2_IOCTL_H
+#ifndef OCFS2_IOCTL_PROTO_H
+#define OCFS2_IOCTL_PROTO_H
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
-#endif /* OCFS2_IOCTL_H */
+#endif /* OCFS2_IOCTL_PROTO_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a20a0f1e37f..4b0c68849b3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -28,8 +28,10 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/random.h>
+#include <linux/delay.h>
-#define MLOG_MASK_PREFIX ML_JOURNAL
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -46,12 +48,16 @@
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
+#include "uptodate.h"
#include "quota.h"
#include "buffer_head_io.h"
+#include "ocfs2_trace.h"
DEFINE_SPINLOCK(trans_inc_lock);
+#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
+
static int ocfs2_force_read_journal(struct inode *inode);
static int ocfs2_recover_node(struct ocfs2_super *osb,
int node_num, int slot_num);
@@ -296,19 +302,17 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
{
int status = 0;
unsigned int flushed;
- unsigned long old_id;
struct ocfs2_journal *journal = NULL;
- mlog_entry_void();
-
journal = osb->journal;
/* Flush all pending commits and checkpoint the journal. */
down_write(&journal->j_trans_barrier);
- if (atomic_read(&journal->j_num_trans) == 0) {
+ flushed = atomic_read(&journal->j_num_trans);
+ trace_ocfs2_commit_cache_begin(flushed);
+ if (flushed == 0) {
up_write(&journal->j_trans_barrier);
- mlog(0, "No transactions for me to flush!\n");
goto finally;
}
@@ -321,25 +325,20 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
goto finally;
}
- old_id = ocfs2_inc_trans_id(journal);
+ ocfs2_inc_trans_id(journal);
flushed = atomic_read(&journal->j_num_trans);
atomic_set(&journal->j_num_trans, 0);
up_write(&journal->j_trans_barrier);
- mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
- journal->j_trans_id, flushed);
+ trace_ocfs2_commit_cache_end(journal->j_trans_id, flushed);
ocfs2_wake_downconvert_thread(osb);
wake_up(&journal->j_checkpointed);
finally:
- mlog_exit(status);
return status;
}
-/* pass it NULL and it will allocate a new handle object for you. If
- * you pass it a handle however, it may still return error, in which
- * case it has free'd the passed handle for you. */
handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
{
journal_t *journal = osb->journal->j_journal;
@@ -357,11 +356,14 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
if (journal_current_handle())
return jbd2_journal_start(journal, max_buffs);
+ sb_start_intwrite(osb->sb);
+
down_read(&osb->journal->j_trans_barrier);
handle = jbd2_journal_start(journal, max_buffs);
if (IS_ERR(handle)) {
up_read(&osb->journal->j_trans_barrier);
+ sb_end_intwrite(osb->sb);
mlog_errno(PTR_ERR(handle));
@@ -390,16 +392,16 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
if (ret < 0)
mlog_errno(ret);
- if (!nested)
+ if (!nested) {
up_read(&journal->j_trans_barrier);
+ sb_end_intwrite(osb->sb);
+ }
return ret;
}
/*
- * 'nblocks' is what you want to add to the current
- * transaction. extend_trans will either extend the current handle by
- * nblocks, or commit it and start a new one with nblocks credits.
+ * 'nblocks' is what you want to add to the current transaction.
*
* This might call jbd2_journal_restart() which will commit dirty buffers
* and then restart the transaction. Before calling
@@ -417,14 +419,17 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
*/
int ocfs2_extend_trans(handle_t *handle, int nblocks)
{
- int status;
+ int status, old_nblocks;
BUG_ON(!handle);
- BUG_ON(!nblocks);
+ BUG_ON(nblocks < 0);
+
+ if (!nblocks)
+ return 0;
- mlog_entry_void();
+ old_nblocks = handle->h_buffer_credits;
- mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
+ trace_ocfs2_extend_trans(old_nblocks, nblocks);
#ifdef CONFIG_OCFS2_DEBUG_FS
status = 1;
@@ -437,10 +442,9 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
#endif
if (status > 0) {
- mlog(0,
- "jbd2_journal_extend failed, trying "
- "jbd2_journal_restart\n");
- status = jbd2_journal_restart(handle, nblocks);
+ trace_ocfs2_extend_trans_restart(old_nblocks + nblocks);
+ status = jbd2_journal_restart(handle,
+ old_nblocks + nblocks);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -449,11 +453,44 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
status = 0;
bail:
+ return status;
+}
+
+/*
+ * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
+ * If that fails, restart the transaction & regain write access for the
+ * buffer head which is used for metadata modifications.
+ * Taken from Ext4: extend_or_restart_transaction()
+ */
+int ocfs2_allocate_extend_trans(handle_t *handle, int thresh)
+{
+ int status, old_nblks;
+
+ BUG_ON(!handle);
+
+ old_nblks = handle->h_buffer_credits;
+ trace_ocfs2_allocate_extend_trans(old_nblks, thresh);
+
+ /*
+ * NOTE(review): this returns without extending when the handle has
+ * FEWER than thresh credits, which reads as the opposite of the
+ * function's header comment ("if we have fewer than thresh credits,
+ * extend") - confirm against the callers before changing.
+ */
+ if (old_nblks < thresh)
+ return 0;
+
+ /* as used here: < 0 is an error, > 0 means fall back to a restart */
+ status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ if (status > 0) {
+ status = jbd2_journal_restart(handle, OCFS2_MAX_TRANS_DATA);
+ if (status < 0)
+ mlog_errno(status);
+ }
- mlog_exit(status);
+bail:
return status;
}
+
struct ocfs2_triggers {
struct jbd2_buffer_trigger_type ot_triggers;
int ot_offset;
@@ -464,7 +501,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger
return container_of(triggers, struct ocfs2_triggers, ot_triggers);
}
-static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh,
void *data, size_t size)
{
@@ -483,7 +520,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
* Quota blocks have their own trigger because the struct ocfs2_block_check
* offset depends on the blocksize.
*/
-static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh,
void *data, size_t size)
{
@@ -503,7 +540,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
* Directory blocks also have their own trigger because the
* struct ocfs2_block_check offset depends on the blocksize.
*/
-static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh,
void *data, size_t size)
{
@@ -536,7 +573,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
static struct ocfs2_triggers di_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_dinode, i_check),
@@ -544,15 +581,23 @@ static struct ocfs2_triggers di_triggers = {
static struct ocfs2_triggers eb_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_extent_block, h_check),
};
+/*
+ * Journal triggers for refcount blocks.  ot_offset records where the
+ * rf_check member sits inside struct ocfs2_refcount_block.
+ */
+static struct ocfs2_triggers rb_triggers = {
+	.ot_offset	= offsetof(struct ocfs2_refcount_block, rf_check),
+	.ot_triggers	= {
+		.t_abort	= ocfs2_abort_trigger,
+		.t_frozen	= ocfs2_frozen_trigger,
+	},
+};
+
static struct ocfs2_triggers gd_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
@@ -560,14 +605,14 @@ static struct ocfs2_triggers gd_triggers = {
static struct ocfs2_triggers db_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_db_commit_trigger,
+ .t_frozen = ocfs2_db_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
};
static struct ocfs2_triggers xb_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
@@ -575,14 +620,14 @@ static struct ocfs2_triggers xb_triggers = {
static struct ocfs2_triggers dq_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_dq_commit_trigger,
+ .t_frozen = ocfs2_dq_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
};
static struct ocfs2_triggers dr_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
@@ -590,30 +635,29 @@ static struct ocfs2_triggers dr_triggers = {
static struct ocfs2_triggers dl_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
};
static int __ocfs2_journal_access(handle_t *handle,
- struct inode *inode,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh,
struct ocfs2_triggers *triggers,
int type)
{
int status;
+ struct ocfs2_super *osb =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
- BUG_ON(!inode);
+ BUG_ON(!ci || !ci->ci_ops);
BUG_ON(!handle);
BUG_ON(!bh);
- mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n",
- (unsigned long long)bh->b_blocknr, type,
- (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
- "OCFS2_JOURNAL_ACCESS_CREATE" :
- "OCFS2_JOURNAL_ACCESS_WRITE",
- bh->b_size);
+ trace_ocfs2_journal_access(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)bh->b_blocknr, type, bh->b_size);
/* we can safely remove this assertion after testing. */
if (!buffer_uptodate(bh)) {
@@ -623,15 +667,15 @@ static int __ocfs2_journal_access(handle_t *handle,
BUG();
}
- /* Set the current transaction information on the inode so
+ /* Set the current transaction information on the ci so
* that the locking code knows whether it can drop its locks
- * on this inode or not. We're protected from the commit
+ * on this ci or not. We're protected from the commit
* thread updating the current transaction id until
* ocfs2_commit_trans() because ocfs2_start_trans() took
* j_trans_barrier for us. */
- ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
+ ocfs2_set_ci_lock_trans(osb->journal, ci);
- mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_lock(ci);
switch (type) {
case OCFS2_JOURNAL_ACCESS_CREATE:
case OCFS2_JOURNAL_ACCESS_WRITE:
@@ -644,98 +688,88 @@ static int __ocfs2_journal_access(handle_t *handle,
default:
status = -EINVAL;
- mlog(ML_ERROR, "Uknown access type!\n");
+ mlog(ML_ERROR, "Unknown access type!\n");
}
- if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
+ if (!status && ocfs2_meta_ecc(osb) && triggers)
jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ ocfs2_metadata_cache_io_unlock(ci);
if (status < 0)
mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
status, type);
- mlog_exit(status);
return status;
}
-int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
- struct buffer_head *bh, int type)
+int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &di_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type);
}
-int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &eb_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type);
}
-int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &gd_triggers,
+ return __ocfs2_journal_access(handle, ci, bh, &rb_triggers,
type);
}
-int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &db_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type);
}
-int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &xb_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type);
}
-int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &dq_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type);
}
-int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &dr_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type);
}
-int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, &dl_triggers,
- type);
+ return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type);
+}
+
+int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type);
}
-int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type)
{
- return __ocfs2_journal_access(handle, inode, bh, NULL, type);
+ return __ocfs2_journal_access(handle, ci, bh, NULL, type);
}
-int ocfs2_journal_dirty(handle_t *handle,
- struct buffer_head *bh)
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
{
int status;
- mlog_entry("(bh->b_blocknr=%llu)\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
status = jbd2_journal_dirty_metadata(handle, bh);
- if (status < 0)
- mlog(ML_ERROR, "Could not dirty metadata buffer. "
- "(bh->b_blocknr=%llu)\n",
- (unsigned long long)bh->b_blocknr);
-
- mlog_exit(status);
- return status;
+ BUG_ON(status);
}
#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -748,13 +782,13 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
if (osb->osb_commit_interval)
commit_interval = osb->osb_commit_interval;
- spin_lock(&journal->j_state_lock);
+ write_lock(&journal->j_state_lock);
journal->j_commit_interval = commit_interval;
if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
journal->j_flags |= JBD2_BARRIER;
else
journal->j_flags &= ~JBD2_BARRIER;
- spin_unlock(&journal->j_state_lock);
+ write_unlock(&journal->j_state_lock);
}
int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
@@ -767,8 +801,6 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
struct ocfs2_super *osb;
int inode_lock = 0;
- mlog_entry_void();
-
BUG_ON(!journal);
osb = journal->j_osb;
@@ -805,17 +837,16 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
inode_lock = 1;
di = (struct ocfs2_dinode *)bh->b_data;
- if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
+ if (i_size_read(inode) < OCFS2_MIN_JOURNAL_SIZE) {
mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
- inode->i_size);
+ i_size_read(inode));
status = -EINVAL;
goto done;
}
- mlog(0, "inode->i_size = %lld\n", inode->i_size);
- mlog(0, "inode->i_blocks = %llu\n",
- (unsigned long long)inode->i_blocks);
- mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
+ trace_ocfs2_journal_init(i_size_read(inode),
+ (unsigned long long)inode->i_blocks,
+ OCFS2_I(inode)->ip_clusters);
/* call the kernel's journal init function now */
j_journal = jbd2_journal_init_inode(inode);
@@ -825,8 +856,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
goto done;
}
- mlog(0, "Returned from jbd2_journal_init_inode\n");
- mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
+ trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen);
*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
OCFS2_JOURNAL_DIRTY_FL);
@@ -851,7 +881,6 @@ done:
}
}
- mlog_exit(status);
return status;
}
@@ -874,8 +903,6 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
struct buffer_head *bh = journal->j_bh;
struct ocfs2_dinode *fe;
- mlog_entry_void();
-
fe = (struct ocfs2_dinode *)bh->b_data;
/* The journal bh on the osb always comes from ocfs2_journal_init()
@@ -894,11 +921,10 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
ocfs2_bump_recovery_generation(fe);
ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
- status = ocfs2_write_block(osb, bh, journal->j_inode);
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(journal->j_inode));
if (status < 0)
mlog_errno(status);
- mlog_exit(status);
return status;
}
@@ -913,8 +939,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
struct inode *inode = NULL;
int num_running_trans = 0;
- mlog_entry_void();
-
BUG_ON(!osb);
journal = osb->journal;
@@ -931,10 +955,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
BUG();
num_running_trans = atomic_read(&(osb->journal->j_num_trans));
- if (num_running_trans > 0)
- mlog(0, "Shutting down journal: must wait on %d "
- "running transactions!\n",
- num_running_trans);
+ trace_ocfs2_journal_shutdown(num_running_trans);
/* Do a commit_cache here. It will flush our journal, *and*
* release any locks that are still held.
@@ -947,7 +968,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
* completely destroy the journal. */
if (osb->commit_task) {
/* Wait for the commit thread */
- mlog(0, "Waiting for ocfs2commit to exit....\n");
+ trace_ocfs2_journal_shutdown_wait(osb->commit_task);
kthread_stop(osb->commit_task);
osb->commit_task = NULL;
}
@@ -990,7 +1011,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
done:
if (inode)
iput(inode);
- mlog_exit_void();
}
static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1016,8 +1036,6 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
int status = 0;
struct ocfs2_super *osb;
- mlog_entry_void();
-
BUG_ON(!journal);
osb = journal->j_osb;
@@ -1051,7 +1069,6 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
osb->commit_task = NULL;
done:
- mlog_exit(status);
return status;
}
@@ -1062,8 +1079,6 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
{
int status;
- mlog_entry_void();
-
BUG_ON(!journal);
status = jbd2_journal_wipe(journal->j_journal, full);
@@ -1077,7 +1092,6 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
mlog_errno(status);
bail:
- mlog_exit(status);
return status;
}
@@ -1116,11 +1130,9 @@ static int ocfs2_force_read_journal(struct inode *inode)
#define CONCURRENT_JOURNAL_FILL 32ULL
struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
- mlog_entry_void();
-
memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
- num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
+ num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
v_blkno = 0;
while (v_blkno < num_blocks) {
status = ocfs2_extent_map_get_blocks(inode, v_blkno,
@@ -1153,7 +1165,6 @@ static int ocfs2_force_read_journal(struct inode *inode)
bail:
for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
brelse(bhs[i]);
- mlog_exit(status);
return status;
}
@@ -1177,7 +1188,7 @@ struct ocfs2_la_recovery_item {
*/
void ocfs2_complete_recovery(struct work_struct *work)
{
- int ret;
+ int ret = 0;
struct ocfs2_journal *journal =
container_of(work, struct ocfs2_journal, j_recovery_work);
struct ocfs2_super *osb = journal->j_osb;
@@ -1186,9 +1197,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
struct ocfs2_quota_recovery *qrec;
LIST_HEAD(tmp_la_list);
- mlog_entry_void();
-
- mlog(0, "completing recovery from keventd\n");
+ trace_ocfs2_complete_recovery(
+ (unsigned long long)OCFS2_I(journal->j_inode)->ip_blkno);
spin_lock(&journal->j_lock);
list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
@@ -1197,15 +1207,18 @@ void ocfs2_complete_recovery(struct work_struct *work)
list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
list_del_init(&item->lri_list);
- mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
-
ocfs2_wait_on_quotas(osb);
la_dinode = item->lri_la_dinode;
- if (la_dinode) {
- mlog(0, "Clean up local alloc %llu\n",
- (unsigned long long)le64_to_cpu(la_dinode->i_blkno));
+ tl_dinode = item->lri_tl_dinode;
+ qrec = item->lri_qrec;
+ trace_ocfs2_complete_recovery_slot(item->lri_slot,
+ la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
+ tl_dinode ? le64_to_cpu(tl_dinode->i_blkno) : 0,
+ qrec);
+
+ if (la_dinode) {
ret = ocfs2_complete_local_alloc_recovery(osb,
la_dinode);
if (ret < 0)
@@ -1214,11 +1227,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
kfree(la_dinode);
}
- tl_dinode = item->lri_tl_dinode;
if (tl_dinode) {
- mlog(0, "Clean up truncate log %llu\n",
- (unsigned long long)le64_to_cpu(tl_dinode->i_blkno));
-
ret = ocfs2_complete_truncate_log_recovery(osb,
tl_dinode);
if (ret < 0)
@@ -1231,9 +1240,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
if (ret < 0)
mlog_errno(ret);
- qrec = item->lri_qrec;
if (qrec) {
- mlog(0, "Recovering quota files");
ret = ocfs2_finish_quota_recovery(osb, qrec,
item->lri_slot);
if (ret < 0)
@@ -1244,8 +1251,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
kfree(item);
}
- mlog(0, "Recovery completion\n");
- mlog_exit_void();
+ trace_ocfs2_complete_recovery_end(ret);
}
/* NOTE: This function always eats your references to la_dinode and
@@ -1264,11 +1270,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
/* Though we wish to avoid it, we are in fact safe in
* skipping local alloc cleanup as fsck.ocfs2 is more
* than capable of reclaiming unused space. */
- if (la_dinode)
- kfree(la_dinode);
-
- if (tl_dinode)
- kfree(tl_dinode);
+ kfree(la_dinode);
+ kfree(tl_dinode);
if (qrec)
ocfs2_free_quota_recovery(qrec);
@@ -1295,6 +1298,9 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
{
struct ocfs2_journal *journal = osb->journal;
+ if (ocfs2_is_hard_readonly(osb))
+ return;
+
/* No need to queue up our truncate_log as regular cleanup will catch
* that */
ocfs2_queue_recovery_completion(journal, osb->slot_num,
@@ -1331,8 +1337,6 @@ static int __ocfs2_recovery_thread(void *arg)
int rm_quota_used = 0, i;
struct ocfs2_quota_recovery *qrec;
- mlog_entry_void();
-
status = ocfs2_wait_on_mount(osb);
if (status < 0) {
goto bail;
@@ -1364,15 +1368,12 @@ restart:
* clear it until ocfs2_recover_node() has succeeded. */
node_num = rm->rm_entries[0];
spin_unlock(&osb->osb_lock);
- mlog(0, "checking node %d\n", node_num);
slot_num = ocfs2_node_num_to_slot(osb, node_num);
+ trace_ocfs2_recovery_thread_node(node_num, slot_num);
if (slot_num == -ENOENT) {
status = 0;
- mlog(0, "no slot for this node, so no recovery"
- "required.\n");
goto skip_recovery;
}
- mlog(0, "node %d was using slot %d\n", node_num, slot_num);
/* It is a bit subtle with quota recovery. We cannot do it
* immediately because we have to obtain cluster locks from
@@ -1399,7 +1400,7 @@ skip_recovery:
spin_lock(&osb->osb_lock);
}
spin_unlock(&osb->osb_lock);
- mlog(0, "All nodes recovered\n");
+ trace_ocfs2_recovery_thread_end(status);
/* Refresh all journal recovery generations from disk */
status = ocfs2_check_journals_nolocks(osb);
@@ -1408,7 +1409,7 @@ skip_recovery:
mlog_errno(status);
/* Now it is right time to recover quotas... We have to do this under
- * superblock lock so that noone can start using the slot (and crash)
+ * superblock lock so that no one can start using the slot (and crash)
* before we recover it */
for (i = 0; i < rm_quota_used; i++) {
qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
@@ -1440,10 +1441,8 @@ bail:
mutex_unlock(&osb->recovery_lock);
- if (rm_quota)
- kfree(rm_quota);
+ kfree(rm_quota);
- mlog_exit(status);
/* no one is calling kthread_stop() for us so the kthread() api
* requires that we call do_exit(). And it isn't exported, but
* complete_and_exit() seems to be a minimal wrapper around it. */
@@ -1453,19 +1452,15 @@ bail:
void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
{
- mlog_entry("(node_num=%d, osb->node_num = %d)\n",
- node_num, osb->node_num);
-
mutex_lock(&osb->recovery_lock);
- if (osb->disable_recovery)
- goto out;
- /* People waiting on recovery will wait on
- * the recovery map to empty. */
- if (ocfs2_recovery_map_set(osb, node_num))
- mlog(0, "node %d already in recovery map.\n", node_num);
+ trace_ocfs2_recovery_thread(node_num, osb->node_num,
+ osb->disable_recovery, osb->recovery_thread_task,
+ osb->disable_recovery ?
+ -1 : ocfs2_recovery_map_set(osb, node_num));
- mlog(0, "starting recovery thread...\n");
+ if (osb->disable_recovery)
+ goto out;
if (osb->recovery_thread_task)
goto out;
@@ -1480,8 +1475,6 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
out:
mutex_unlock(&osb->recovery_lock);
wake_up(&osb->recovery_event);
-
- mlog_exit_void();
}
static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
@@ -1555,7 +1548,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
* If not, it needs recovery.
*/
if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
- mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num,
+ trace_ocfs2_replay_journal_recovered(slot_num,
osb->slot_recovery_generations[slot_num], slot_reco_gen);
osb->slot_recovery_generations[slot_num] = slot_reco_gen;
status = -EBUSY;
@@ -1566,7 +1559,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
if (status < 0) {
- mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
+ trace_ocfs2_replay_journal_lock_err(status);
if (status != -ERESTARTSYS)
mlog(ML_ERROR, "Could not lock journal!\n");
goto done;
@@ -1579,7 +1572,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
slot_reco_gen = ocfs2_get_recovery_generation(fe);
if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
- mlog(0, "No recovery required for node %d\n", node_num);
+ trace_ocfs2_replay_journal_skip(node_num);
/* Refresh recovery generation for the slot */
osb->slot_recovery_generations[slot_num] = slot_reco_gen;
goto done;
@@ -1588,9 +1581,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
/* we need to run complete recovery for offline orphan slots */
ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
- mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
- node_num, slot_num,
- MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+ printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\
+ "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -1600,7 +1593,6 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
goto done;
}
- mlog(0, "calling journal_init_inode\n");
journal = jbd2_journal_init_inode(inode);
if (journal == NULL) {
mlog(ML_ERROR, "Linux journal layer error\n");
@@ -1620,7 +1612,6 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
ocfs2_clear_journal_error(osb->sb, journal, slot_num);
/* wipe the journal */
- mlog(0, "flushing the journal.\n");
jbd2_journal_lock_updates(journal);
status = jbd2_journal_flush(journal);
jbd2_journal_unlock_updates(journal);
@@ -1638,7 +1629,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
ocfs2_get_recovery_generation(fe);
ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
- status = ocfs2_write_block(osb, bh, inode);
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
if (status < 0)
mlog_errno(status);
@@ -1647,6 +1638,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
jbd2_journal_destroy(journal);
+ printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\
+ "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
done:
/* drop the lock on this node's journal */
if (got_lock)
@@ -1657,7 +1651,6 @@ done:
brelse(bh);
- mlog_exit(status);
return status;
}
@@ -1680,8 +1673,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
struct ocfs2_dinode *la_copy = NULL;
struct ocfs2_dinode *tl_copy = NULL;
- mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
- node_num, slot_num, osb->node_num);
+ trace_ocfs2_recover_node(node_num, slot_num, osb->node_num);
/* Should not ever be called to recover ourselves -- in that
* case we should've called ocfs2_journal_load instead. */
@@ -1690,9 +1682,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
status = ocfs2_replay_journal(osb, node_num, slot_num);
if (status < 0) {
if (status == -EBUSY) {
- mlog(0, "Skipping recovery for slot %u (node %u) "
- "as another node has recovered it\n", slot_num,
- node_num);
+ trace_ocfs2_recover_node_skip(slot_num, node_num);
status = 0;
goto done;
}
@@ -1727,7 +1717,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
status = 0;
done:
- mlog_exit(status);
return status;
}
@@ -1800,8 +1789,8 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
spin_lock(&osb->osb_lock);
osb->slot_recovery_generations[i] = gen;
- mlog(0, "Slot %u recovery generation is %u\n", i,
- osb->slot_recovery_generations[i]);
+ trace_ocfs2_mark_dead_nodes(i,
+ osb->slot_recovery_generations[i]);
if (i == osb->slot_num) {
spin_unlock(&osb->osb_lock);
@@ -1837,11 +1826,158 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
status = 0;
bail:
- mlog_exit(status);
return status;
}
+/*
+ * Delay, in jiffies, until the next orphan scan.  The base period is
+ * ORPHAN_SCAN_SCHEDULE_TIMEOUT milliseconds; a random 0..4999 ms is added
+ * on top to minimize multiple nodes firing their timers at the same time.
+ */
+static inline unsigned long ocfs2_orphan_scan_timeout(void)
+{
+	unsigned long jitter_ms;
+
+	get_random_bytes(&jitter_ms, sizeof(jitter_ms));
+	jitter_ms %= 5000;
+
+	return msecs_to_jiffies(ORPHAN_SCAN_SCHEDULE_TIMEOUT + jitter_ms);
+}
+
+/*
+ * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for every
+ * slot, which queues a recovery pass for that slot.  The purpose is to catch
+ * orphans that were left behind in the orphan directories.
+ *
+ * All slots are scanned, including the ones currently in use, in order to
+ * handle the following case:
+ *
+ *   Node 1 has an inode it was using.  The dentry went away due to memory
+ *   pressure.  Node 1 closes the inode, but it's on the free list.  The node
+ *   has the open lock.
+ *   Node 2 unlinks the inode.  It grabs the dentry lock to notify others,
+ *   but node 1 has no dentry and doesn't get the message.  It trylocks the
+ *   open lock, sees that another node has a PR, and does nothing.
+ *   Later node 2 runs its orphan dir.  It igets the inode, trylocks the
+ *   open lock, sees the PR still, and does nothing.
+ *   Basically, we have to trigger an orphan iput on node 1.  The only way
+ *   for this to happen is if node 1 runs node 2's orphan dir.
+ *
+ * The function is invoked periodically from ocfs2_orphan_scan_work().  It
+ * takes the orphan scan lock and reads back the sequence number associated
+ * with it.  If that number differs from the one remembered in os_seqno,
+ * some other node has done the scan in the meantime: this node only records
+ * the new number and skips the scan.  If the number is unchanged, no scan
+ * has happened, so this node queues one and hands an incremented sequence
+ * number back when it unlocks.
+ */
+void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
+	u32 seqno = 0;
+	int status, slot;
+
+	if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
+		goto out;
+
+	trace_ocfs2_queue_orphan_scan_begin(os->os_count, os->os_seqno,
+					    atomic_read(&os->os_state));
+
+	status = ocfs2_orphan_scan_lock(osb, &seqno);
+	if (status < 0) {
+		if (status != -EAGAIN)
+			mlog_errno(status);
+		goto out;
+	}
+
+	/* Do not queue the tasks if the volume is being umounted */
+	if (atomic_read(&os->os_state) != ORPHAN_SCAN_INACTIVE) {
+		if (os->os_seqno != seqno) {
+			/* Somebody else scanned; just track their seqno. */
+			os->os_seqno = seqno;
+		} else {
+			for (slot = 0; slot < osb->max_slots; slot++)
+				ocfs2_queue_recovery_completion(osb->journal,
+								slot, NULL,
+								NULL, NULL);
+			/*
+			 * We queued a recovery on orphan slots, increment
+			 * the sequence number so that other nodes will skip
+			 * the scan for a while.
+			 */
+			seqno++;
+			os->os_count++;
+			os->os_scantime = CURRENT_TIME;
+		}
+	}
+
+	ocfs2_orphan_scan_unlock(osb, seqno);
+out:
+	trace_ocfs2_queue_orphan_scan_end(os->os_count, os->os_seqno,
+					  atomic_read(&os->os_state));
+}
+
+/*
+ * Delayed-work handler for the orphan scan.  Runs one scan pass under
+ * os_lock and, as long as scanning is still marked active, re-queues itself
+ * on ocfs2_wq after ocfs2_orphan_scan_timeout() jiffies.
+ */
+void ocfs2_orphan_scan_work(struct work_struct *work)
+{
+	struct ocfs2_orphan_scan *os =
+		container_of(work, struct ocfs2_orphan_scan,
+			     os_orphan_scan_work.work);
+	struct ocfs2_super *osb = os->os_osb;
+
+	mutex_lock(&os->os_lock);
+
+	ocfs2_queue_orphan_scan(osb);
+
+	if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
+		queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+				   ocfs2_orphan_scan_timeout());
+
+	mutex_unlock(&os->os_lock);
+}
+
+/*
+ * Stop periodic orphan scanning: if it is currently active, mark it inactive
+ * and then cancel the pending delayed work while holding os_lock.
+ */
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
+
+	if (atomic_read(&os->os_state) != ORPHAN_SCAN_ACTIVE)
+		return;
+
+	atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
+
+	mutex_lock(&os->os_lock);
+	cancel_delayed_work(&os->os_orphan_scan_work);
+	mutex_unlock(&os->os_lock);
+}
+
+/*
+ * Set up the per-superblock orphan scan state: back pointer to @osb, zeroed
+ * scan counter and sequence number, the os_lock mutex and the delayed work
+ * item that runs ocfs2_orphan_scan_work().  Nothing is queued here.
+ */
+void ocfs2_orphan_scan_init(struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
+
+	os->os_osb = osb;
+	os->os_count = 0;
+	os->os_seqno = 0;
+
+	mutex_init(&os->os_lock);
+	INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work);
+}
+
+/*
+ * Record the current time as the last scan time and decide whether periodic
+ * scanning should run.  On a hard read-only or locally mounted volume the
+ * scan is left inactive; otherwise it is marked active and the first pass is
+ * queued on ocfs2_wq.
+ */
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
+
+	os->os_scantime = CURRENT_TIME;
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) {
+		atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
+		return;
+	}
+
+	atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
+	queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+			   ocfs2_orphan_scan_timeout());
+}
+
struct ocfs2_orphan_filldir_priv {
+ struct dir_context ctx;
struct inode *head;
struct ocfs2_super *osb;
};
@@ -1863,8 +1999,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
if (IS_ERR(iter))
return 0;
- mlog(0, "queue orphan %llu\n",
- (unsigned long long)OCFS2_I(iter)->ip_blkno);
+ trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
/* No locking is required for the next_orphan queue as there
* is only ever a single process doing orphan recovery. */
OCFS2_I(iter)->ip_next_orphan = p->head;
@@ -1879,11 +2014,11 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
{
int status;
struct inode *orphan_dir_inode = NULL;
- struct ocfs2_orphan_filldir_priv priv;
- loff_t pos = 0;
-
- priv.osb = osb;
- priv.head = *head;
+ struct ocfs2_orphan_filldir_priv priv = {
+ .ctx.actor = ocfs2_orphan_filldir,
+ .osb = osb,
+ .head = *head
+ };
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
ORPHAN_DIR_SYSTEM_INODE,
@@ -1892,7 +2027,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
status = -ENOENT;
mlog_errno(status);
return status;
- }
+ }
mutex_lock(&orphan_dir_inode->i_mutex);
status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
@@ -1901,8 +2036,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
goto out;
}
- status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv,
- ocfs2_orphan_filldir);
+ status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx);
if (status) {
mlog_errno(status);
goto out_cluster;
@@ -1980,7 +2114,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
struct inode *iter;
struct ocfs2_inode_info *oi;
- mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
+ trace_ocfs2_recover_orphans(slot);
ocfs2_mark_recovering_orphan_dir(osb, slot);
ret = ocfs2_queue_orphans(osb, slot, &inode);
@@ -1993,17 +2127,12 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
while (inode) {
oi = OCFS2_I(inode);
- mlog(0, "iput orphan %llu\n", (unsigned long long)oi->ip_blkno);
+ trace_ocfs2_recover_orphans_iput(
+ (unsigned long long)oi->ip_blkno);
iter = oi->ip_next_orphan;
spin_lock(&oi->ip_lock);
- /* The remote delete code may have set these on the
- * assumption that the other node would wipe them
- * successfully. If they are still in the node's
- * orphan dir, we need to reset that state. */
- oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
-
/* Set the proper information to get us going into
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
@@ -2031,6 +2160,7 @@ static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
* MOUNTED flag, but this is set right before
* dismount_volume() so we can trust it. */
if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
+ trace_ocfs2_wait_on_mount(VOLUME_DISABLED);
mlog(0, "mount error, exiting!\n");
return -EBUSY;
}
@@ -2056,8 +2186,20 @@ static int ocfs2_commit_thread(void *arg)
|| kthread_should_stop());
status = ocfs2_commit_cache(osb);
- if (status < 0)
- mlog_errno(status);
+ if (status < 0) {
+ static unsigned long abort_warn_time;
+
+ /* Warn about this once per minute */
+ if (printk_timed_ratelimit(&abort_warn_time, 60*HZ))
+ mlog(ML_ERROR, "status = %d, journal is "
+ "already aborted.\n", status);
+ /*
+ * After ocfs2_commit_cache() fails, j_num_trans has a
+ * non-zero value. Sleep here to avoid a busy-wait
+ * loop.
+ */
+ msleep_interruptible(1000);
+ }
if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
mlog(ML_KTHREAD,
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index eb7b76331eb..7f8cde94abf 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
struct buffer_head *j_bh; /* Journal disk inode block */
atomic_t j_num_trans; /* Number of transactions
* currently in the system. */
+ spinlock_t j_lock;
unsigned long j_trans_id;
struct rw_semaphore j_trans_barrier;
wait_queue_head_t j_checkpointed;
- spinlock_t j_lock;
+ /* both fields protected by j_lock*/
struct list_head j_la_cleanups;
struct work_struct j_recovery_work;
};
@@ -90,60 +91,75 @@ static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
return old_id;
}
-static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
- struct inode *inode)
+static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal,
+ struct ocfs2_caching_info *ci)
{
spin_lock(&trans_inc_lock);
- OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
+ ci->ci_last_trans = journal->j_trans_id;
spin_unlock(&trans_inc_lock);
}
/* Used to figure out whether it's safe to drop a metadata lock on an
- * inode. Returns true if all the inodes changes have been
+ * cached object. Returns true if all the object's changes have been
* checkpointed to disk. You should be holding the spinlock on the
* metadata lock while calling this to be sure that nobody can take
* the lock and put it on another transaction. */
-static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
+static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci)
{
int ret;
- struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
+ struct ocfs2_journal *journal =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
spin_lock(&trans_inc_lock);
- ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
+ ret = time_after(journal->j_trans_id, ci->ci_last_trans);
spin_unlock(&trans_inc_lock);
return ret;
}
-/* convenience function to check if an inode is still new (has never
- * hit disk) Will do you a favor and set created_trans = 0 when you've
- * been checkpointed. returns '1' if the inode is still new. */
-static inline int ocfs2_inode_is_new(struct inode *inode)
+/* convenience function to check if an object backed by struct
+ * ocfs2_caching_info is still new (has never hit disk) Will do you a
+ * favor and set created_trans = 0 when you've
+ * been checkpointed. returns '1' if the ci is still new. */
+static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci)
{
int ret;
+ struct ocfs2_journal *journal =
+ OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
+
+ spin_lock(&trans_inc_lock);
+ ret = !(time_after(journal->j_trans_id, ci->ci_created_trans));
+ if (!ret)
+ ci->ci_created_trans = 0;
+ spin_unlock(&trans_inc_lock);
+ return ret;
+}
+/* Wrapper for inodes so we can check system files */
+static inline int ocfs2_inode_is_new(struct inode *inode)
+{
/* System files are never "new" as they're written out by
* mkfs. This helps us early during mount, before we have the
* journal open and j_trans_id could be junk. */
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
return 0;
- spin_lock(&trans_inc_lock);
- ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
- OCFS2_I(inode)->ip_created_trans));
- if (!ret)
- OCFS2_I(inode)->ip_created_trans = 0;
- spin_unlock(&trans_inc_lock);
- return ret;
+
+ return ocfs2_ci_is_new(INODE_CACHE(inode));
}
-static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
- struct inode *inode)
+static inline void ocfs2_ci_set_new(struct ocfs2_super *osb,
+ struct ocfs2_caching_info *ci)
{
spin_lock(&trans_inc_lock);
- OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
+ ci->ci_created_trans = osb->journal->j_trans_id;
spin_unlock(&trans_inc_lock);
}
/* Exported only for the journal struct init code in super.c. Do not call. */
+void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
+
void ocfs2_complete_recovery(struct work_struct *work);
void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
@@ -184,7 +200,6 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
{
- atomic_set(&osb->needs_checkpoint, 1);
wake_up(&osb->checkpoint_event);
}
@@ -195,17 +210,17 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
if (ocfs2_mount_local(osb))
return;
- if (!ocfs2_inode_fully_checkpointed(inode)) {
+ if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) {
/* WARNING: This only kicks off a single
* checkpoint. If someone races you and adds more
* metadata to the journal, you won't know, and will
- * wind up waiting *alot* longer than necessary. Right
+ * wind up waiting *a lot* longer than necessary. Right
* now we only use this in clear_inode so that's
* OK. */
ocfs2_start_checkpoint(osb);
wait_event(osb->journal->j_checkpointed,
- ocfs2_inode_fully_checkpointed(inode));
+ ocfs2_ci_fully_checkpointed(INODE_CACHE(inode)));
}
}
@@ -243,6 +258,17 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb,
int ocfs2_commit_trans(struct ocfs2_super *osb,
handle_t *handle);
int ocfs2_extend_trans(handle_t *handle, int nblocks);
+int ocfs2_allocate_extend_trans(handle_t *handle,
+ int thresh);
+
+/*
+ * Define an arbitrary limit for the amount of data we will anticipate
+ * writing to any given transaction. For unbounded transactions such as
+ * fallocate(2) we can write more than this, but we always
+ * start off at the maximum transaction size and grow the transaction
+ * optimistically as we go.
+ */
+#define OCFS2_MAX_TRANS_DATA 64U
/*
* Create access is for when we get a newly created buffer and we're
@@ -261,31 +287,34 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks);
/* ocfs2_inode */
-int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_extent_block */
-int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
+ struct buffer_head *bh, int type);
+/* ocfs2_refcount_block */
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_group_desc */
-int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_xattr_block */
-int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* quota blocks */
-int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* dirblock */
-int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_dx_root_block */
-int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* ocfs2_dx_leaf */
-int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/* Anything that has no ecc */
-int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
/*
@@ -307,8 +336,7 @@ int ocfs2_journal_access(handle_t *handle, struct inode *inode,
* <modify the bh>
* ocfs2_journal_dirty(handle, bh);
*/
-int ocfs2_journal_dirty(handle_t *handle,
- struct buffer_head *bh);
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
/*
* Credit Macros:
@@ -325,20 +353,27 @@ int ocfs2_journal_dirty(handle_t *handle,
/* extended attribute block update */
#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+/* Update of a single quota block */
+#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1
+
/* global quotafile inode update, data block */
-#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
+#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS
/*
* The two writes below can accidentally see global info dirty due
* to set_info() quotactl so make them prepared for the writes.
*/
/* quota data block, global info */
/* Write to local quota file */
-#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
/* global quota data block, local quota data block, global quota inode,
* global quota info */
-#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
+#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+ 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
static inline int ocfs2_quota_trans_credits(struct super_block *sb)
{
@@ -351,11 +386,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb)
return credits;
}
-/* Number of credits needed for removing quota structure from file */
-int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
-/* Number of credits needed for initialization of new quota structure */
-int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
-
/* group extend. inode update and last group update. */
#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
@@ -385,9 +415,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
ocfs2_quota_trans_credits(sb);
}
-/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
- * bitmap block for the new bit) dx_root update for free list */
-#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
+/* data block for new dir/symlink, allocation of directory block, dx_root
+ * update for free list */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1)
static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
{
@@ -421,10 +451,11 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
- * update on dir + index leaf + dx root update for free list */
+ * update on dir + index leaf + dx root update for free list +
+ * previous dirblock update in the free list */
static inline int ocfs2_link_credits(struct super_block *sb)
{
- return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
+ return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
ocfs2_quota_trans_credits(sb);
}
@@ -470,14 +501,30 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
return credits;
}
+/* inode update, new refcount block and its allocation credits. */
+#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \
+ + OCFS2_SUBALLOC_ALLOC)
+
+/* inode and the refcount block update. */
+#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/*
+ * inode and the refcount block update.
+ * It doesn't include the credits for sub alloc change.
+ * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added.
+ */
+#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* 2 metadata alloc, 2 new blocks and root refcount block */
+#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3)
+
/*
* Please note that the caller must make sure that root_el is the root
* of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
* the result may be wrong.
*/
static inline int ocfs2_calc_extend_credits(struct super_block *sb,
- struct ocfs2_extent_list *root_el,
- u32 bits_wanted)
+ struct ocfs2_extent_list *root_el)
{
int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
@@ -525,6 +572,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
return blocks;
}
+/*
+ * Allocating a discontiguous block group requires the credits from
+ * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
+ * the group descriptor's extent list. The caller already has started
+ * the transaction with ocfs2_calc_group_alloc_credits(). They extend
+ * it with these credits.
+ */
+static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
+{
+	/* One credit per extent record that a group descriptor can hold. */
+	int credits = ocfs2_extent_recs_per_gd(sb);
+
+	return credits;
+}
+
static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
unsigned int clusters_to_del,
struct ocfs2_dinode *fe,
@@ -567,4 +626,15 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
new_size);
}
+/*
+ * Record the id of the transaction behind @handle in @inode's ocfs2
+ * inode info: always as i_sync_tid, and additionally as i_datasync_tid
+ * when @datasync is non-zero.
+ */
+static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
+						  struct inode *inode,
+						  int datasync)
+{
+	struct ocfs2_inode_info *info = OCFS2_I(inode);
+
+	info->i_sync_tid = handle->h_transaction->t_tid;
+	if (!datasync)
+		return;
+
+	info->i_datasync_tid = handle->h_transaction->t_tid;
+}
+
#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index bac7e6abaf4..04401345562 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -29,7 +29,6 @@
#include <linux/highmem.h>
#include <linux/bitops.h>
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -43,6 +42,7 @@
#include "suballoc.h"
#include "super.h"
#include "sysfile.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc,
- u32 numbits);
+ u32 *numbits,
+ struct ocfs2_alloc_reservation *resv);
static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
@@ -74,6 +75,150 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct inode *local_alloc_inode);
+/*
+ * ocfs2_la_default_mb() - determine a default size, in megabytes of
+ * the local alloc.
+ *
+ * Generally, we'd like to pick as large a local alloc as
+ * possible. Performance on large workloads tends to scale
+ * proportionally to la size. In addition to that, the reservations
+ * code functions more efficiently as it can reserve more windows for
+ * write.
+ *
+ * Some things work against us when trying to choose a large local alloc:
+ *
+ * - We need to ensure our sizing is picked to leave enough space in
+ * group descriptors for other allocations (such as block groups,
+ * etc). Picking default sizes which are a multiple of 4 could help
+ * - block groups are allocated in 2mb and 4mb chunks.
+ *
+ * - Likewise, we don't want to starve other nodes of bits on small
+ * file systems. This can easily be taken care of by limiting our
+ * default to a reasonable size (256M) on larger cluster sizes.
+ *
+ * - Some file systems can't support very large sizes - 4k and 8k in
+ * particular are limited to less than 128 and 256 megabytes respectively.
+ *
+ * The following reference table shows group descriptor and local
+ * alloc maximums at various cluster sizes (4k blocksize)
+ *
+ * csize: 4K group: 126M la: 121M
+ * csize: 8K group: 252M la: 243M
+ * csize: 16K group: 504M la: 486M
+ * csize: 32K group: 1008M la: 972M
+ * csize: 64K group: 2016M la: 1944M
+ * csize: 128K group: 4032M la: 3888M
+ * csize: 256K group: 8064M la: 7776M
+ * csize: 512K group: 16128M la: 15552M
+ * csize: 1024K group: 32256M la: 31104M
+ */
+#define OCFS2_LA_MAX_DEFAULT_MB 256
+#define OCFS2_LA_OLD_DEFAULT 8
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
+{
+	unsigned int la_mb;
+	unsigned int gd_mb;
+	unsigned int la_max_mb;
+	unsigned int megs_per_slot;
+	struct super_block *sb = osb->sb;
+
+	/*
+	 * Returns the default local alloc window size in megabytes.
+	 * Start from the size of one cluster group, expressed in
+	 * megabytes, and pare it down in the steps below.
+	 */
+	gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
+		8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
+
+	/*
+	 * This takes care of file systems with very small group
+	 * descriptors - 512 byte blocksize at cluster sizes lower
+	 * than 16K and also 1k blocksize with 4k cluster size.
+	 */
+	if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
+	    || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
+		return OCFS2_LA_OLD_DEFAULT;
+
+	/*
+	 * Leave enough room for some block groups and make the final
+	 * value we work from a multiple of 4. Clearing the two low
+	 * bits rounds down to a multiple of 4; a mask of 0xFFFFFFFB
+	 * would clear only bit 2 and leave e.g. 110 as 106.
+	 */
+	gd_mb -= 16;
+	gd_mb &= ~3U;
+
+	la_mb = gd_mb;
+
+	/*
+	 * Keep window sizes down to a reasonable default
+	 */
+	if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
+		/*
+		 * Some clustersize / blocksize combinations will have
+		 * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
+		 * default size, but get poor distribution when
+		 * limited to exactly 256 megabytes.
+		 *
+		 * As an example, 16K clustersize at 4K blocksize
+		 * gives us a cluster group size of 504M. Paring the
+		 * local alloc size down to 256 however, would give us
+		 * only one window and around 200MB left in the
+		 * cluster group. Instead, find the first size below
+		 * 256 which would give us an even distribution.
+		 *
+		 * Larger cluster group sizes actually work out pretty
+		 * well when pared to 256, so we don't have to do this
+		 * for any group that fits more than two
+		 * OCFS2_LA_MAX_DEFAULT_MB windows.
+		 */
+		if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
+			la_mb = OCFS2_LA_MAX_DEFAULT_MB;
+		else {
+			unsigned int gd_mult = gd_mb;
+
+			/* Halve until the window fits under the cap. */
+			while (gd_mult > OCFS2_LA_MAX_DEFAULT_MB)
+				gd_mult = gd_mult >> 1;
+
+			la_mb = gd_mult;
+		}
+	}
+
+	/* Clusters available per slot, then converted to megabytes. */
+	megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
+	megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
+	/* Too many nodes, too few disk clusters. */
+	if (megs_per_slot < la_mb)
+		la_mb = megs_per_slot;
+
+	/* We can't store more bits than we can in a block. */
+	la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
+						ocfs2_local_alloc_size(sb) * 8);
+	if (la_mb > la_max_mb)
+		la_mb = la_max_mb;
+
+	return la_mb;
+}
+
+/*
+ * Set osb->local_alloc_default_bits and osb->local_alloc_bits from a
+ * requested window size in megabytes. A request of -1 means "no user
+ * request" and selects ocfs2_la_default_mb(); a request above the
+ * largest window that fits is clamped to that maximum.
+ */
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
+{
+	struct super_block *sb = osb->sb;
+	unsigned int la_default_mb = ocfs2_la_default_mb(osb);
+	unsigned int la_max_mb = ocfs2_clusters_to_megabytes(sb,
+					ocfs2_local_alloc_size(sb) * 8);
+	unsigned int window_mb;
+
+	trace_ocfs2_la_set_sizes(requested_mb, la_max_mb, la_default_mb);
+
+	if (requested_mb == -1)
+		window_mb = la_default_mb;	/* No user request */
+	else if (requested_mb > la_max_mb)
+		/*
+		 * Request is too big, we give the maximum available.
+		 * The comparison is done in unsigned arithmetic, so a
+		 * negative request other than -1 also lands here.
+		 */
+		window_mb = la_max_mb;
+	else
+		window_mb = requested_mb;
+
+	osb->local_alloc_default_bits =
+		ocfs2_megabytes_to_clusters(sb, window_mb);
+	osb->local_alloc_bits = osb->local_alloc_default_bits;
+}
+
static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
{
return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -130,8 +275,8 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
ret = 1;
bail:
- mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
- osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
+ trace_ocfs2_alloc_should_use_local(
+ (unsigned long long)bits, osb->local_alloc_state, la_bits, ret);
spin_unlock(&osb->osb_lock);
return ret;
}
@@ -145,8 +290,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
struct inode *inode = NULL;
struct ocfs2_local_alloc *la;
- mlog_entry_void();
-
if (osb->local_alloc_bits == 0)
goto bail;
@@ -156,7 +299,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
osb->local_alloc_bits, (osb->bitmap_cpg - 1));
osb->local_alloc_bits =
ocfs2_megabytes_to_clusters(osb->sb,
- OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
+ ocfs2_la_default_mb(osb));
}
/* read the alloc off disk */
@@ -218,9 +361,10 @@ bail:
if (inode)
iput(inode);
- mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
+ trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -242,8 +386,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
struct ocfs2_dinode *alloc_copy = NULL;
struct ocfs2_dinode *alloc = NULL;
- mlog_entry_void();
-
cancel_delayed_work(&osb->la_enable_wq);
flush_workqueue(ocfs2_wq);
@@ -262,6 +404,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
osb->local_alloc_state = OCFS2_LA_DISABLED;
+ ocfs2_resmap_uninit(&osb->osb_la_resmap);
+
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -297,20 +441,15 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
}
memcpy(alloc_copy, alloc, bh->b_size);
- status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode),
+ bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_commit;
}
ocfs2_clear_local_alloc(alloc);
-
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, bh);
brelse(bh);
osb->local_alloc_bh = NULL;
@@ -337,10 +476,7 @@ out:
if (local_alloc_inode)
iput(local_alloc_inode);
- if (alloc_copy)
- kfree(alloc_copy);
-
- mlog_exit_void();
+ kfree(alloc_copy);
}
/*
@@ -359,7 +495,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
struct inode *inode = NULL;
struct ocfs2_dinode *alloc;
- mlog_entry("(slot_num = %d)\n", slot_num);
+ trace_ocfs2_begin_local_alloc_recovery(slot_num);
*alloc_copy = NULL;
@@ -392,12 +528,12 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
ocfs2_clear_local_alloc(alloc);
ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
- status = ocfs2_write_block(osb, alloc_bh, inode);
+ status = ocfs2_write_block(osb, alloc_bh, INODE_CACHE(inode));
if (status < 0)
mlog_errno(status);
bail:
- if ((status < 0) && (*alloc_copy)) {
+ if (status < 0) {
kfree(*alloc_copy);
*alloc_copy = NULL;
}
@@ -409,7 +545,8 @@ bail:
iput(inode);
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -427,8 +564,6 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
struct buffer_head *main_bm_bh = NULL;
struct inode *main_bm_inode;
- mlog_entry_void();
-
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -476,51 +611,12 @@ out_mutex:
out:
if (!status)
- ocfs2_init_inode_steal_slot(osb);
- mlog_exit(status);
+ ocfs2_init_steal_slots(osb);
+ if (status)
+ mlog_errno(status);
return status;
}
-/* Check to see if the local alloc window is within ac->ac_max_block */
-static int ocfs2_local_alloc_in_range(struct inode *inode,
- struct ocfs2_alloc_context *ac,
- u32 bits_wanted)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_dinode *alloc;
- struct ocfs2_local_alloc *la;
- int start;
- u64 block_off;
-
- if (!ac->ac_max_block)
- return 1;
-
- alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
- la = OCFS2_LOCAL_ALLOC(alloc);
-
- start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
- if (start == -1) {
- mlog_errno(-ENOSPC);
- return 0;
- }
-
- /*
- * Converting (bm_off + start + bits_wanted) to blocks gives us
- * the blkno just past our actual allocation. This is perfect
- * to compare with ac_max_block.
- */
- block_off = ocfs2_clusters_to_blocks(inode->i_sb,
- le32_to_cpu(la->la_bm_off) +
- start + bits_wanted);
- mlog(0, "Checking %llu against %llu\n",
- (unsigned long long)block_off,
- (unsigned long long)ac->ac_max_block);
- if (block_off > ac->ac_max_block)
- return 0;
-
- return 1;
-}
-
/*
* make sure we've got at least bits_wanted contiguous bits in the
* local alloc. You lose them when you drop i_mutex.
@@ -537,8 +633,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
struct inode *local_alloc_inode;
unsigned int free_bits;
- mlog_entry_void();
-
BUG_ON(!ac);
local_alloc_inode =
@@ -609,21 +703,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
goto bail;
}
- if (ac->ac_max_block)
- mlog(0, "Calling in_range for max block %llu\n",
- (unsigned long long)ac->ac_max_block);
-
- if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
- bits_wanted)) {
- /*
- * The window is outside ac->ac_max_block.
- * This errno tells the caller to keep localalloc enabled
- * but to get the allocation from the main bitmap.
- */
- status = -EFBIG;
- goto bail;
- }
-
ac->ac_inode = local_alloc_inode;
/* We should never use localalloc from another slot */
ac->ac_alloc_slot = osb->slot_num;
@@ -637,10 +716,12 @@ bail:
iput(local_alloc_inode);
}
- mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
- status);
+ trace_ocfs2_reserve_local_alloc_bits(
+ (unsigned long long)ac->ac_max_block,
+ bits_wanted, osb->slot_num, status);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -657,14 +738,14 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc;
struct ocfs2_local_alloc *la;
- mlog_entry_void();
BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
local_alloc_inode = ac->ac_inode;
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
la = OCFS2_LOCAL_ALLOC(alloc);
- start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+ start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
+ ac->ac_resv);
if (start == -1) {
/* TODO: Shouldn't we just BUG here? */
status = -ENOSPC;
@@ -674,11 +755,10 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
bitmap = la->la_bitmap;
*bit_off = le32_to_cpu(la->la_bm_off) + start;
- /* local alloc is always contiguous by nature -- we never
- * delete bits from it! */
*num_bits = bits_wanted;
- status = ocfs2_journal_access_di(handle, local_alloc_inode,
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(local_alloc_inode),
osb->local_alloc_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
@@ -686,55 +766,114 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
goto bail;
}
+ ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
+ bits_wanted);
+
while(bits_wanted--)
ocfs2_set_bit(start++, bitmap);
le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
+bail:
+ if (status)
+ mlog_errno(status);
+ return status;
+}
- status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 bit_off,
+ u32 num_bits)
+{
+ int status, start;
+ u32 clear_bits;
+ struct inode *local_alloc_inode;
+ void *bitmap;
+ struct ocfs2_dinode *alloc;
+ struct ocfs2_local_alloc *la;
+
+ BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+ local_alloc_inode = ac->ac_inode;
+ alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+ la = OCFS2_LOCAL_ALLOC(alloc);
+
+ bitmap = la->la_bitmap;
+ start = bit_off - le32_to_cpu(la->la_bm_off);
+ clear_bits = num_bits;
+
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(local_alloc_inode),
+ osb->local_alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = 0;
+ while (clear_bits--)
+ ocfs2_clear_bit(start++, bitmap);
+
+ le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits);
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
bail:
- mlog_exit(status);
return status;
}
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
{
- int i;
- u8 *buffer;
- u32 count = 0;
+ u32 count;
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
- mlog_entry_void();
-
- buffer = la->la_bitmap;
- for (i = 0; i < le16_to_cpu(la->la_size); i++)
- count += hweight8(buffer[i]);
+ count = memweight(la->la_bitmap, le16_to_cpu(la->la_size));
- mlog_exit(count);
+ trace_ocfs2_local_alloc_count_bits(count);
return count;
}
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
- struct ocfs2_dinode *alloc,
- u32 numbits)
+ struct ocfs2_dinode *alloc,
+ u32 *numbits,
+ struct ocfs2_alloc_reservation *resv)
{
int numfound, bitoff, left, startoff, lastzero;
+ int local_resv = 0;
+ struct ocfs2_alloc_reservation r;
void *bitmap = NULL;
-
- mlog_entry("(numbits wanted = %u)\n", numbits);
+ struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
if (!alloc->id1.bitmap1.i_total) {
- mlog(0, "No bits in my window!\n");
bitoff = -1;
goto bail;
}
+ if (!resv) {
+ local_resv = 1;
+ ocfs2_resv_init_once(&r);
+ ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
+ resv = &r;
+ }
+
+ numfound = *numbits;
+ if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
+ if (numfound < *numbits)
+ *numbits = numfound;
+ goto bail;
+ }
+
+ /*
+ * Code error. While reservations are enabled, local
+ * allocation should _always_ go through them.
+ */
+ BUG_ON(osb->osb_resv_level != 0);
+
+ /*
+ * Reservations are disabled. Handle this the old way.
+ */
+
bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
numfound = bitoff = startoff = 0;
@@ -760,22 +899,27 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
startoff = bitoff+1;
}
/* we got everything we needed */
- if (numfound == numbits) {
+ if (numfound == *numbits) {
/* mlog(0, "Found it all!\n"); */
break;
}
}
- mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
- numfound);
+ trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound);
- if (numfound == numbits)
+ if (numfound == *numbits)
bitoff = startoff - numfound;
else
bitoff = -1;
bail:
- mlog_exit(bitoff);
+ if (local_resv)
+ ocfs2_resv_discard(resmap, resv);
+
+ trace_ocfs2_local_alloc_find_clear_bits(*numbits,
+ le32_to_cpu(alloc->id1.bitmap1.i_total),
+ bitoff, numfound);
+
return bitoff;
}
@@ -783,15 +927,12 @@ static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
{
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
int i;
- mlog_entry_void();
alloc->id1.bitmap1.i_total = 0;
alloc->id1.bitmap1.i_used = 0;
la->la_bm_off = 0;
for(i = 0; i < le16_to_cpu(la->la_size); i++)
la->la_bitmap[i] = 0;
-
- mlog_exit_void();
}
#if 0
@@ -832,18 +973,16 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
void *bitmap;
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
- mlog_entry("total = %u, used = %u\n",
- le32_to_cpu(alloc->id1.bitmap1.i_total),
- le32_to_cpu(alloc->id1.bitmap1.i_used));
+ trace_ocfs2_sync_local_to_main(
+ le32_to_cpu(alloc->id1.bitmap1.i_total),
+ le32_to_cpu(alloc->id1.bitmap1.i_used));
if (!alloc->id1.bitmap1.i_total) {
- mlog(0, "nothing to sync!\n");
goto bail;
}
if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
le32_to_cpu(alloc->id1.bitmap1.i_total)) {
- mlog(0, "all bits were taken!\n");
goto bail;
}
@@ -865,14 +1004,15 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
ocfs2_clusters_to_blocks(osb->sb,
start - count);
- mlog(0, "freeing %u bits starting at local alloc bit "
- "%u (la_start_blk = %llu, blkno = %llu)\n",
+ trace_ocfs2_sync_local_to_main_free(
count, start - count,
(unsigned long long)la_start_blk,
(unsigned long long)blkno);
- status = ocfs2_free_clusters(handle, main_bm_inode,
- main_bm_bh, blkno, count);
+ status = ocfs2_release_clusters(handle,
+ main_bm_inode,
+ main_bm_bh, blkno,
+ count);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -885,7 +1025,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
}
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -984,7 +1125,6 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
retry_enospc:
(*ac)->ac_bits_wanted = osb->local_alloc_bits;
-
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
if (status == -ENOSPC) {
if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1011,7 +1151,8 @@ bail:
*ac = NULL;
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1027,17 +1168,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc = NULL;
struct ocfs2_local_alloc *la;
- mlog_entry_void();
-
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
la = OCFS2_LOCAL_ALLOC(alloc);
- if (alloc->id1.bitmap1.i_total)
- mlog(0, "asking me to alloc a new window over a non-empty "
- "one\n");
-
- mlog(0, "Allocating %u clusters for a new window.\n",
- osb->local_alloc_bits);
+ trace_ocfs2_local_alloc_new_window(
+ le32_to_cpu(alloc->id1.bitmap1.i_total),
+ osb->local_alloc_bits);
/* Instruct the allocation code to try the most recently used
* cluster group. We'll re-record the group used this pass
@@ -1047,7 +1183,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
/* we used the generic suballoc reserve function, but we set
* everything up nicely, so there's no reason why we can't use
* the more specific cluster api to claim bits. */
- status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
+ status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
&cluster_off, &cluster_count);
if (status == -ENOSPC) {
retry_enospc:
@@ -1060,7 +1196,8 @@ retry_enospc:
OCFS2_LA_DISABLED)
goto bail;
- status = ocfs2_claim_clusters(osb, handle, ac,
+ ac->ac_bits_wanted = osb->local_alloc_bits;
+ status = ocfs2_claim_clusters(handle, ac,
osb->local_alloc_bits,
&cluster_off,
&cluster_count);
@@ -1095,13 +1232,16 @@ retry_enospc:
memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
le16_to_cpu(la->la_size));
- mlog(0, "New window allocated:\n");
- mlog(0, "window la_bm_off = %u\n",
- OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
- mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
+ ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
+ OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
+
+ trace_ocfs2_local_alloc_new_window_result(
+ OCFS2_LOCAL_ALLOC(alloc)->la_bm_off,
+ le32_to_cpu(alloc->id1.bitmap1.i_total));
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1118,8 +1258,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc_copy = NULL;
struct ocfs2_alloc_context *ac = NULL;
- mlog_entry_void();
-
ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
/* This will lock the main bitmap for us. */
@@ -1156,7 +1294,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
}
memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
- status = ocfs2_journal_access_di(handle, local_alloc_inode,
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(local_alloc_inode),
osb->local_alloc_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
@@ -1165,12 +1304,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
}
ocfs2_clear_local_alloc(alloc);
-
- status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
main_bm_inode, main_bm_bh);
@@ -1188,7 +1322,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
atomic_inc(&osb->alloc_stats.moves);
- status = 0;
bail:
if (handle)
ocfs2_commit_trans(osb, handle);
@@ -1198,13 +1331,13 @@ bail:
if (main_bm_inode)
iput(main_bm_inode);
- if (alloc_copy)
- kfree(alloc_copy);
+ kfree(alloc_copy);
if (ac)
ocfs2_free_alloc_context(ac);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index ac5ea9f8665..44a7d1fb2de 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
+
int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
int node_num,
struct ocfs2_dinode **alloc_copy);
@@ -52,6 +55,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
u32 *bit_off,
u32 *num_bits);
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 bit_off,
+ u32 num_bits);
+
void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
unsigned int num_clusters);
void ocfs2_la_enable_worker(struct work_struct *work);
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac624517..6b6d092b099 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -26,7 +26,6 @@
#include <linux/fs.h>
#include <linux/fcntl.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -83,6 +82,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
}
ret = flock_lock_file_wait(file, fl);
+ if (ret)
+ ocfs2_file_unlock(file);
out:
mutex_unlock(&fp->fp_mutex);
@@ -133,7 +134,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if (__mandatory_lock(inode))
+ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
return -ENOLCK;
return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index b606496b72e..10d66c75cec 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,14 +25,12 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/signal.h>
#include <linux/rbtree.h>
-#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -42,52 +40,29 @@
#include "file.h"
#include "inode.h"
#include "mmap.h"
+#include "super.h"
+#include "ocfs2_trace.h"
-static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
-{
- /* The best way to deal with signals in the vm path is
- * to block them upfront, rather than allowing the
- * locking paths to return -ERESTARTSYS. */
- sigfillset(blocked);
-
- /* We should technically never get a bad return value
- * from sigprocmask */
- return sigprocmask(SIG_BLOCK, blocked, oldset);
-}
-
-static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
-{
- return sigprocmask(SIG_SETMASK, oldset, NULL);
-}
static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
{
- sigset_t blocked, oldset;
- int error, ret;
-
- mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
-
- error = ocfs2_vm_op_block_sigs(&blocked, &oldset);
- if (error < 0) {
- mlog_errno(error);
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
+ sigset_t oldset;
+ int ret;
+ ocfs2_block_signals(&oldset);
ret = filemap_fault(area, vmf);
+ ocfs2_unblock_signals(&oldset);
- error = ocfs2_vm_op_unblock_sigs(&oldset);
- if (error < 0)
- mlog_errno(error);
-out:
- mlog_exit_ptr(vmf->page);
+ trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno,
+ area, vmf->page, vmf->pgoff);
return ret;
}
-static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
+static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
struct page *page)
{
- int ret;
+ int ret = VM_FAULT_NOPAGE;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
loff_t pos = page_offset(page);
unsigned int len = PAGE_CACHE_SIZE;
@@ -96,30 +71,25 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
void *fsdata;
loff_t size = i_size_read(inode);
- /*
- * Another node might have truncated while we were waiting on
- * cluster locks.
- */
- last_index = size >> PAGE_CACHE_SHIFT;
- if (page->index > last_index) {
- ret = -EINVAL;
- goto out;
- }
+ last_index = (size - 1) >> PAGE_CACHE_SHIFT;
/*
- * The i_size check above doesn't catch the case where nodes
- * truncated and then re-extended the file. We'll re-check the
- * page mapping after taking the page lock inside of
- * ocfs2_write_begin_nolock().
+ * There are cases in which the page no longer belongs to the
+ * mapping.
+ * 1) pagecache truncates locally due to memory pressure.
+ * 2) pagecache truncates when another is taking EX lock against
+ * inode lock. see ocfs2_data_convert_worker.
+ *
+ * The i_size check doesn't catch the case where nodes truncated and
+ * then re-extended the file. We'll re-check the page mapping after
+ * taking the page lock inside of ocfs2_write_begin_nolock().
+ *
+ * Let VM retry with these cases.
*/
- if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
- /*
- * the page has been umapped in ocfs2_data_downconvert_worker.
- * So return 0 here and let VFS retry.
- */
- ret = 0;
+ if ((page->mapping != inode->i_mapping) ||
+ (!PageUptodate(page)) ||
+ (page_offset(page) >= size))
goto out;
- }
/*
* Call ocfs2_write_begin() and ocfs2_write_end() to take
@@ -132,24 +102,28 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
* because the "write" would invalidate their data.
*/
if (page->index == last_index)
- len = size & ~PAGE_CACHE_MASK;
+ len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
- ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
+ ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
&fsdata, di_bh, page);
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
- ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
- fsdata);
- if (ret < 0) {
- mlog_errno(ret);
+ if (!locked_page) {
+ ret = VM_FAULT_NOPAGE;
goto out;
}
+ ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
+ fsdata);
BUG_ON(ret != len);
- ret = 0;
+ ret = VM_FAULT_LOCKED;
out:
return ret;
}
@@ -157,16 +131,13 @@ out:
static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
struct buffer_head *di_bh = NULL;
- sigset_t blocked, oldset;
- int ret, ret2;
+ sigset_t oldset;
+ int ret;
- ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
+ sb_start_pagefault(inode->i_sb);
+ ocfs2_block_signals(&oldset);
/*
* The cluster locks taken will block a truncate from another
@@ -186,7 +157,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = __ocfs2_page_mkwrite(inode, di_bh, page);
+ ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
@@ -194,33 +165,30 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ocfs2_inode_unlock(inode, 1);
out:
- ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
- if (ret2 < 0)
- mlog_errno(ret2);
- if (ret)
- ret = VM_FAULT_SIGBUS;
+ ocfs2_unblock_signals(&oldset);
+ sb_end_pagefault(inode->i_sb);
return ret;
}
-static struct vm_operations_struct ocfs2_file_vm_ops = {
+static const struct vm_operations_struct ocfs2_file_vm_ops = {
.fault = ocfs2_fault,
.page_mkwrite = ocfs2_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
{
int ret = 0, lock_level = 0;
- ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
- file->f_vfsmnt, &lock_level);
+ ret = ocfs2_inode_lock_atime(file_inode(file),
+ file->f_path.mnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
+ ocfs2_inode_unlock(file_inode(file), lock_level);
out:
vma->vm_ops = &ocfs2_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
new file mode 100644
index 00000000000..599eb4c4c8b
--- /dev/null
+++ b/fs/ocfs2/move_extents.c
@@ -0,0 +1,1078 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.c
+ *
+ * Copyright (C) 2011 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/swap.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_ioctl.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "super.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "sysfile.h"
+#include "refcounttree.h"
+#include "move_extents.h"
+
+/*
+ * Per-request state shared by the extent moving / defragmenting helpers
+ * in this file.
+ */
+struct ocfs2_move_extents_context {
+ /* inode whose extents are being moved */
+ struct inode *inode;
+ /* NOTE(review): presumably the file the request came in on - confirm */
+ struct file *file;
+ /* non-zero: defragment; zero: move extents towards range->me_goal */
+ int auto_defrag;
+ /* non-zero: defrag may move fewer clusters than asked for an extent */
+ int partial;
+ int credits;
+ /* physical cluster the most recently moved extent was placed at */
+ u32 new_phys_cpos;
+ /* running total of clusters moved, reported back as me_moved_len */
+ u32 clusters_moved;
+ /* copy of the dinode's i_refcount_loc, used to lock the refcount tree */
+ u64 refcount_loc;
+ /* the request being served; me_flags and result fields are updated */
+ struct ocfs2_move_extents *range;
+ /* extent tree of the inode, set up from its dinode buffer */
+ struct ocfs2_extent_tree et;
+ /* metadata / data reservations, taken and freed for each extent */
+ struct ocfs2_alloc_context *meta_ac;
+ struct ocfs2_alloc_context *data_ac;
+ /* deferred deallocations, run once the range has been processed */
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+};
+
+/*
+ * Re-point the extent at logical offset 'cpos' (length 'len' clusters)
+ * from physical cluster 'p_cpos' to 'new_p_cpos', inside the transaction
+ * 'handle' started by the caller.
+ *
+ * The clusters are first duplicated with
+ * ocfs2_duplicate_clusters_by_page(), then the matching extent record is
+ * replaced through ocfs2_split_extent().  The replacement record keeps
+ * 'ext_flags' except for OCFS2_EXT_REFCOUNTED.  On success the new
+ * location is stored in context->new_phys_cpos and the old clusters are
+ * either un-refcounted or appended to the truncate log.
+ *
+ * Returns 0 on success or an error code; -EROFS is returned (after
+ * ocfs2_error()) when the extent can no longer be found in the tree.
+ */
+static int __ocfs2_move_extent(handle_t *handle,
+ struct ocfs2_move_extents_context *context,
+ u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
+ int ext_flags)
+{
+ int ret = 0, index;
+ struct inode *inode = context->inode;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_extent_rec *rec, replace_rec;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_list *el;
+ u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
+ u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
+
+ ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
+ p_cpos, new_p_cpos, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* build the record that will replace the old mapping */
+ memset(&replace_rec, 0, sizeof(replace_rec));
+ replace_rec.e_cpos = cpu_to_le32(cpos);
+ replace_rec.e_leaf_clusters = cpu_to_le16(len);
+ replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
+ new_p_cpos));
+
+ path = ocfs2_new_path_from_et(&context->et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has an extent at cpos %u which can no "
+ "longer be found.\n",
+ (unsigned long long)ino, cpos);
+ ret = -EROFS;
+ goto out;
+ }
+
+ rec = &el->l_recs[index];
+
+ /* the caller's view of the extent flags must match the on-disk record */
+ BUG_ON(ext_flags != rec->e_flags);
+ /*
+ * after moving/defraging to new location, the extent is not going
+ * to be refcounted anymore.
+ */
+ replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ context->et.et_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_split_extent(handle, &context->et, path, index,
+ &replace_rec, context->meta_ac,
+ &context->dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_journal_dirty(handle, context->et.et_root_bh);
+
+ context->new_phys_cpos = new_p_cpos;
+
+ /*
+ * Give up the old clusters: drop one reference if the extent was
+ * refcounted, otherwise append them to the truncate log.  The
+ * result of this step is what the function returns.
+ */
+ if (old_blkno) {
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ ret = ocfs2_decrease_refcount(inode, handle,
+ ocfs2_blocks_to_clusters(osb->sb,
+ old_blkno),
+ len, context->meta_ac,
+ &context->dealloc, 1);
+ else
+ ret = ocfs2_truncate_log_append(osb, handle,
+ old_blkno, len);
+ }
+
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
+out:
+ ocfs2_free_path(path);
+ return ret;
+}
+
+/*
+ * Lock the allocators and reserve what moving one extent may need:
+ * metadata blocks for extent tree growth and, optionally, data clusters.
+ *
+ * Callers that do not need clusters reserved pass a NULL 'data_ac'.
+ * '*credits' is increased by the credits needed to extend the tree.
+ *
+ * Returns 0 on success.  On failure the error code is returned and a
+ * metadata reservation left in '*meta_ac' is released and cleared.
+ */
+static int ocfs2_lock_allocators_move_extents(struct inode *inode,
+					struct ocfs2_extent_tree *et,
+					u32 clusters_to_move,
+					u32 extents_to_split,
+					struct ocfs2_alloc_context **meta_ac,
+					struct ocfs2_alloc_context **data_ac,
+					int extra_blocks,
+					int *credits)
+{
+	int status, free_recs;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned int recs_wanted = clusters_to_move + 2 * extents_to_split;
+
+	free_recs = ocfs2_num_free_extents(osb, et);
+	if (free_recs < 0) {
+		status = free_recs;
+		mlog_errno(status);
+		goto fail;
+	}
+
+	/* the tree has to grow: account for the blocks that takes */
+	if (free_recs == 0 ||
+	    (ocfs2_sparse_alloc(osb) && free_recs < recs_wanted))
+		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+	status = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
+	if (status) {
+		mlog_errno(status);
+		goto fail;
+	}
+
+	if (data_ac) {
+		status = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
+		if (status) {
+			mlog_errno(status);
+			goto fail;
+		}
+	}
+
+	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
+
+	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
+	     extra_blocks, clusters_to_move, *credits);
+
+	return status;
+
+fail:
+	if (*meta_ac) {
+		ocfs2_free_alloc_context(*meta_ac);
+		*meta_ac = NULL;
+	}
+
+	return status;
+}
+
+/*
+ * Defragment one extent by claiming a new run of clusters and moving the
+ * extent onto it.
+ *
+ * One journal handle is used for the whole operation to guarantee data
+ * consistency in case a crash happens anywhere.
+ *
+ * @context:   state of the running move-extents request
+ * @cpos:      logical cluster offset of the extent inside the file
+ * @phys_cpos: physical cluster the extent currently starts at
+ * @len:       in: clusters to move; out: lowered to the number of clusters
+ *             actually claimed when a partial move was allowed and taken
+ * @ext_flags: flags of the extent record being moved
+ *
+ * XXX: defrag can end up moving only part of the extent when the
+ * allocator cannot find enough contiguous clusters.
+ *
+ * Returns 0 on success, otherwise the first error that was hit.  A
+ * failure of __ocfs2_move_extent() is kept even when the writeback sync
+ * that follows it succeeds.
+ */
+static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
+			       u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
+{
+	int ret, sync_ret, credits = 0, extra_blocks = 0;
+	int partial = context->partial;
+	handle_t *handle;
+	struct inode *inode = context->inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
+	u32 new_phys_cpos, new_len;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
+
+		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+			 OCFS2_HAS_REFCOUNT_FL));
+
+		BUG_ON(!context->refcount_loc);
+
+		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+					       &ref_tree, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+
+		ret = ocfs2_prepare_refcount_change_for_del(inode,
+							context->refcount_loc,
+							phys_blkno,
+							*len,
+							&credits,
+							&extra_blocks);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
+						 &context->meta_ac,
+						 &context->data_ac,
+						 extra_blocks, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * should be using allocation reservation strategy there?
+	 *
+	 * if (context->data_ac)
+	 *	context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+	 */
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_unlock_mutex;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock_mutex;
+	}
+
+	ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
+				     &new_phys_cpos, &new_len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * allowing partial extent moving is kind of 'pros and cons', it makes
+	 * whole defragmentation less likely to fail, on the contrary, the bad
+	 * thing is it may make the fs even more fragmented after moving, let
+	 * userspace make a good decision here.
+	 */
+	if (new_len != *len) {
+		mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
+		if (!partial) {
+			context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
+			ret = -ENOSPC;
+			goto out_commit;
+		}
+	}
+
+	mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
+	     phys_cpos, new_phys_cpos);
+
+	ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
+				  new_phys_cpos, ext_flags);
+	if (ret)
+		mlog_errno(ret);
+
+	if (partial && (new_len != *len))
+		*len = new_len;
+
+	/*
+	 * Here we should write the new page out first if we are
+	 * in write-back mode.
+	 *
+	 * The sync is still attempted after a failed move, but its result
+	 * must not hide that failure: only take it over when the move
+	 * itself went fine.
+	 */
+	sync_ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode,
+					    cpos, *len);
+	if (sync_ret) {
+		mlog_errno(sync_ret);
+		if (!ret)
+			ret = sync_ret;
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_unlock_mutex:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (context->data_ac) {
+		ocfs2_free_alloc_context(context->data_ac);
+		context->data_ac = NULL;
+	}
+
+	if (context->meta_ac) {
+		ocfs2_free_alloc_context(context->meta_ac);
+		context->meta_ac = NULL;
+	}
+
+out:
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+	return ret;
+}
+
+/*
+ * Find the victim alloc group, i.e. the group that block #vict_blkno
+ * falls into.
+ *
+ * @inode:      any inode on the volume, used to reach the super block
+ * @vict_blkno: block number to locate
+ * @type, @slot: identify the allocator system inode to search
+ * @vict_bit:   out: bit of 'vict_blkno' inside the group that was found
+ * @ret_bh:     out: buffer of the group descriptor that was found
+ *
+ * For the global bitmap one bit stands for one cluster, so bit counts are
+ * scaled to blocks by 'bits_per_unit' before they are compared with block
+ * numbers; for every other allocator the shift is zero.
+ *
+ * Returns 0 with '*ret_bh' and '*vict_bit' set, -ENOENT if the allocator
+ * inode cannot be looked up, -EINVAL if the block is outside the allocator
+ * or matches no group, or the error of a failed block read.  On success
+ * the caller has to release '*ret_bh'; on failure '*ret_bh' is not
+ * touched.
+ */
+static int ocfs2_find_victim_alloc_group(struct inode *inode,
+					 u64 vict_blkno,
+					 int type, int slot,
+					 int *vict_bit,
+					 struct buffer_head **ret_bh)
+{
+	int ret, i, bits_per_unit = 0;
+	u64 blkno, group_start, group_span;
+	char namebuf[40];
+
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
+	struct ocfs2_chain_list *cl;
+	struct ocfs2_chain_rec *rec;
+	struct ocfs2_dinode *ac_dinode;
+	struct ocfs2_group_desc *bg;
+
+	ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
+	ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
+					 strlen(namebuf), &blkno);
+	if (ret) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
+	cl = &(ac_dinode->id2.i_chain);
+	rec = &(cl->cl_recs[0]);
+
+	if (type == GLOBAL_BITMAP_SYSTEM_INODE)
+		bits_per_unit = osb->s_clustersize_bits -
+					inode->i_sb->s_blocksize_bits;
+	/*
+	 * 'vict_blkno' was out of the valid range.  Widen i_total before
+	 * shifting so that large volumes do not wrap at 32 bits.
+	 */
+	if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
+	    (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
+				bits_per_unit))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+
+		rec = &(cl->cl_recs[i]);
+
+		bg = NULL;
+
+		do {
+			if (!bg)
+				blkno = le64_to_cpu(rec->c_blkno);
+			else
+				blkno = le64_to_cpu(bg->bg_next_group);
+
+			if (gd_bh) {
+				brelse(gd_bh);
+				gd_bh = NULL;
+			}
+
+			ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+			/*
+			 * Groups are visited in chain order, not in block
+			 * order, so both ends of the group have to be
+			 * checked; bg_bits is scaled from bits to blocks.
+			 */
+			group_start = le64_to_cpu(bg->bg_blkno);
+			group_span = (u64)le16_to_cpu(bg->bg_bits) <<
+							bits_per_unit;
+
+			if (vict_blkno >= group_start &&
+			    vict_blkno < group_start + group_span) {
+
+				*ret_bh = gd_bh;
+				*vict_bit = (vict_blkno - blkno) >>
+							bits_per_unit;
+				mlog(0, "find the victim group: #%llu, "
+				     "total_bits: %u, vict_bit: %u\n",
+				     (unsigned long long)blkno,
+				     le16_to_cpu(bg->bg_bits),
+				     *vict_bit);
+				goto out;
+			}
+
+		} while (le64_to_cpu(bg->bg_next_group));
+	}
+
+	/*
+	 * No group matched: the last descriptor read is not handed to the
+	 * caller, so drop it here.
+	 */
+	brelse(gd_bh);
+	ret = -EINVAL;
+out:
+	brelse(ac_bh);
+
+	/*
+	 * on success the caller has to release the gd_bh properly.
+	 */
+	return ret;
+}
+
+/*
+ * XXX: helper to validate and adjust moving goal.
+ *
+ * range->me_goal is rounded to the start of its cluster, must lie inside
+ * the global bitmap, and is pushed one cluster forward when it lands on
+ * the group descriptor block itself.  The request is rejected when the
+ * room left in that group behind the goal is smaller than range->me_len.
+ *
+ * Returns 0 when the (possibly adjusted) goal is usable, -EINVAL when the
+ * range does not fit into the group, or the error reported by
+ * ocfs2_find_victim_alloc_group().
+ */
+static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
+					       struct ocfs2_move_extents *range)
+{
+	int ret, goal_bit = 0;
+
+	struct buffer_head *gd_bh = NULL;
+	struct ocfs2_group_desc *bg;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int c_to_b = 1 << (osb->s_clustersize_bits -
+					inode->i_sb->s_blocksize_bits);
+
+	/*
+	 * make goal become cluster aligned.
+	 */
+	range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
+						      range->me_goal);
+	/*
+	 * validate goal sits within global_bitmap, and return the victim
+	 * group desc
+	 */
+	ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
+					    GLOBAL_BITMAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT,
+					    &goal_bit, &gd_bh);
+	if (ret)
+		goto out;
+
+	bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+	/*
+	 * moving goal is not allowed to start with a group desc block
+	 * (#0 blk), let's compromise to the latter cluster.  The goal bit
+	 * moves along with it so the room check below stays exact.
+	 */
+	if (range->me_goal == le64_to_cpu(bg->bg_blkno)) {
+		range->me_goal += c_to_b;
+		goal_bit++;
+	}
+
+	/*
+	 * movement is not gonna cross two groups.  The room is computed in
+	 * 64 bits: bits * clustersize does not fit an int for big clusters.
+	 */
+	if ((u64)(le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
+	    range->me_len) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/*
+	 * more exact validations/adjustments will be performed later during
+	 * moving operation for each extent range.
+	 */
+	mlog(0, "extents get ready to be moved to #%llu block\n",
+	     (unsigned long long)range->me_goal);
+
+out:
+	brelse(gd_bh);
+
+	return ret;
+}
+
+/*
+ * Scan the bitmap of the group descriptor in 'bh' for 'move_len'
+ * consecutive free bits, starting at bit '*goal_bit'.
+ *
+ * When such a run is found, '*goal_bit' is set to the FIRST bit of the
+ * run and '*phys_cpos' to the matching cluster.  When no run is found,
+ * '*phys_cpos' is set to 0 and '*goal_bit' is left alone; that covers
+ * both giving up at a used bit more than 'max_hop' bits behind the
+ * starting point and reaching the end of the group.
+ */
+static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
+				    int *goal_bit, u32 move_len, u32 max_hop,
+				    u32 *phys_cpos)
+{
+	int i, used, start_bit, found = 0;
+	int last_free_bits = 0, base_bit = *goal_bit;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+	u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+						 le64_to_cpu(gd->bg_blkno));
+
+	for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
+
+		used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
+		if (used) {
+			/*
+			 * we even tried searching the free chunk by jumping
+			 * a 'max_hop' distance, but still failed.
+			 */
+			if ((i - base_bit) > max_hop)
+				break;
+
+			/* a used bit ends the current free run */
+			last_free_bits = 0;
+			continue;
+		}
+
+		last_free_bits++;
+
+		if (last_free_bits == move_len) {
+			/* 'i' is the last bit of the run, report its start */
+			start_bit = i - last_free_bits + 1;
+			*goal_bit = start_bit;
+			*phys_cpos = base_cpos + start_bit;
+			found = 1;
+			break;
+		}
+	}
+
+	/* never leave the caller's goal in place as if it were a fit */
+	if (!found)
+		*phys_cpos = 0;
+
+	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
+}
+
+/*
+ * Move one extent to a free region at or behind the goal cluster
+ * '*new_phys_cpos' inside the global bitmap.
+ *
+ * @context:       state of the running move-extents request
+ * @cpos:          logical cluster offset of the extent inside the file
+ * @phys_cpos:     physical cluster the extent currently starts at
+ * @new_phys_cpos: in: goal cluster; out: cluster picked by
+ *                 ocfs2_probe_alloc_group(), 0 if nothing fits
+ * @len:           number of clusters to move
+ * @ext_flags:     flags of the extent record being moved
+ *
+ * No clusters are reserved through an allocation context here; the
+ * global bitmap is locked and the bits of the chosen region are set
+ * directly once the extent has been re-pointed.
+ *
+ * Returns 0 on success, -ENOSPC when no fitting region was found,
+ * otherwise the first error that was hit.  A failure to set the bitmap
+ * bits is kept even when the writeback sync that follows it succeeds.
+ */
+static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
+			     u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
+			     u32 len, int ext_flags)
+{
+	int ret, sync_ret, credits = 0, extra_blocks = 0, goal_bit = 0;
+	handle_t *handle;
+	struct inode *inode = context->inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct inode *gb_inode = NULL;
+	struct buffer_head *gb_bh = NULL;
+	struct buffer_head *gd_bh = NULL;
+	struct ocfs2_group_desc *gd;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
+	u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
+						    context->range->me_threshold);
+	u64 phys_blkno, new_phys_blkno;
+
+	phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
+
+		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+			 OCFS2_HAS_REFCOUNT_FL));
+
+		BUG_ON(!context->refcount_loc);
+
+		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+					       &ref_tree, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+
+		ret = ocfs2_prepare_refcount_change_for_del(inode,
+							context->refcount_loc,
+							phys_blkno,
+							len,
+							&credits,
+							&extra_blocks);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
+						 &context->meta_ac,
+						 NULL, extra_blocks, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * need to count 2 extra credits for global_bitmap inode and
+	 * group descriptor.
+	 */
+	credits += OCFS2_INODE_UPDATE_CREDITS + 1;
+
+	/*
+	 * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
+	 * logic, while we still need to lock the global_bitmap.
+	 */
+	gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
+					       OCFS2_INVALID_SLOT);
+	if (!gb_inode) {
+		mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	mutex_lock(&gb_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock_gb_mutex;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock_tl_inode;
+	}
+
+	new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
+	ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
+					    GLOBAL_BITMAP_SYSTEM_INODE,
+					    OCFS2_INVALID_SLOT,
+					    &goal_bit, &gd_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * probe the victim cluster group to find a proper
+	 * region to fit wanted movement, it even will perform
+	 * a best-effort attempt by compromising to a threshold
+	 * around the goal.
+	 */
+	ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
+				new_phys_cpos);
+	if (!*new_phys_cpos) {
+		ret = -ENOSPC;
+		goto out_commit;
+	}
+
+	ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
+				  *new_phys_cpos, ext_flags);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+	ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
+					       le16_to_cpu(gd->bg_chain));
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
+					 goal_bit, len);
+	if (ret) {
+		ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
+					       le16_to_cpu(gd->bg_chain));
+		mlog_errno(ret);
+	}
+
+	/*
+	 * Here we should write the new page out first if we are
+	 * in write-back mode.
+	 *
+	 * The sync is still attempted after a bitmap failure, but its
+	 * result must not hide that failure: only take it over when
+	 * everything before it went fine.
+	 */
+	sync_ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode,
+					    cpos, len);
+	if (sync_ret) {
+		mlog_errno(sync_ret);
+		if (!ret)
+			ret = sync_ret;
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+	brelse(gd_bh);
+
+out_unlock_tl_inode:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	ocfs2_inode_unlock(gb_inode, 1);
+out_unlock_gb_mutex:
+	mutex_unlock(&gb_inode->i_mutex);
+	brelse(gb_bh);
+	iput(gb_inode);
+
+out:
+	if (context->meta_ac) {
+		ocfs2_free_alloc_context(context->meta_ac);
+		context->meta_ac = NULL;
+	}
+
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+	return ret;
+}
+
+/*
+ * Work out how much of the current extent takes part in this defrag run.
+ *
+ * @alloc_size:   in: length of the extent; out: possibly trimmed so that
+ *                the pieces gathered so far add up to 'threshold'
+ * @len_defraged: clusters gathered so far in the current run
+ * @threshold:    run size at which a defragmentation cycle is complete
+ * @skip:         set to 1 when the extent is to be left alone
+ */
+static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
+					 u32 threshold, int *skip)
+{
+	u32 gathered = *len_defraged;
+	u32 extent_len = *alloc_size;
+
+	/* still below the threshold: take the whole extent and carry on */
+	if (extent_len + gathered < threshold) {
+		*len_defraged = gathered + extent_len;
+		return;
+	}
+
+	/* XXX: nothing gathered yet, so this is a large extent: skip it */
+	if (!gathered) {
+		*skip = 1;
+		return;
+	}
+
+	/*
+	 * Take just enough of this extent to coalesce with the former
+	 * pieces and reach the threshold.  That ends one cycle, and
+	 * resetting 'len_defraged' forces a new one to start.
+	 */
+	*alloc_size = threshold - gathered;
+	*len_defraged = 0;
+}
+
+/*
+ * Walk the cluster range described by context->range and either
+ * defragment it (context->auto_defrag set) or move it towards
+ * range->me_goal, one extent at a time.
+ *
+ * On the way out the number of clusters handled and the new physical
+ * offset are written back into the range in bytes, a truncate log flush
+ * is scheduled and the dealloc context is run.  Empty files, zero-length
+ * requests and inline-data inodes return 0 before any of that happens.
+ *
+ * Returns 0 on success or the first error reported by a helper.
+ */
+static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
+				struct ocfs2_move_extents_context *context)
+{
+	struct inode *inode = context->inode;
+	struct ocfs2_move_extents *range = context->range;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	u32 cpos, phys_cpos, alloc_size;
+	u32 first_cluster, end_cluster, remaining;
+	u32 defrag_run = 0, defrag_limit = 0, goal_cpos = 0;
+	int status = 0, ext_flags, defragging, skip_extent = 0;
+
+	/* nothing to do for empty files or empty requests */
+	if (i_size_read(inode) == 0)
+		return 0;
+	if (range->me_len == 0)
+		return 0;
+
+	/* inline-data inodes are left untouched */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return 0;
+
+	context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
+
+	ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
+	ocfs2_init_dealloc_ctxt(&context->dealloc);
+
+	/*
+	 * TO-DO XXX:
+	 *
+	 * - xattr extents.
+	 */
+
+	defragging = context->auto_defrag;
+
+	/*
+	 * Work is done in whole clusters.  To keep things simple the
+	 * partial clusters containing 'byte_start' and
+	 * 'byte_start + len' may be left out of the range.
+	 */
+	first_cluster = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
+	end_cluster = (range->me_start + range->me_len) >>
+						osb->s_clustersize_bits;
+	remaining = (end_cluster >= first_cluster) ?
+				end_cluster - first_cluster : 0;
+
+	if (!defragging) {
+		goal_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+						     range->me_goal);
+	} else {
+		defrag_limit = range->me_threshold >> osb->s_clustersize_bits;
+		/* a threshold of at most one cluster leaves nothing to merge */
+		if (defrag_limit <= 1)
+			goto done;
+	}
+
+	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
+	     "thresh: %u\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     (unsigned long long)range->me_start,
+	     (unsigned long long)range->me_len,
+	     first_cluster, remaining, defrag_limit);
+
+	/*
+	 * Each pass handles one extent (or hole); the loop step advances
+	 * past whatever length that pass settled on, including when the
+	 * pass bails out early with 'continue'.
+	 */
+	for (cpos = first_cluster; remaining;
+	     cpos += alloc_size, remaining -= alloc_size) {
+		status = ocfs2_get_clusters(inode, cpos, &phys_cpos,
+					    &alloc_size, &ext_flags);
+		if (status) {
+			mlog_errno(status);
+			goto out;
+		}
+
+		/* never run past the end of the requested range */
+		if (alloc_size > remaining)
+			alloc_size = remaining;
+
+		/*
+		 * XXX: a hole is simply stepped over; while defragmenting
+		 * it also ends the current cycle so that a new one begins
+		 * behind it.
+		 */
+		if (!phys_cpos) {
+			if (defragging)
+				defrag_run = 0;
+
+			continue;
+		}
+
+		if (defragging) {
+			ocfs2_calc_extent_defrag_len(&alloc_size, &defrag_run,
+						     defrag_limit,
+						     &skip_extent);
+			/* large extents are not worth defragmenting */
+			if (skip_extent) {
+				skip_extent = 0;
+				continue;
+			}
+
+			mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
+			     "alloc_size: %u, len_defraged: %u\n",
+			     cpos, phys_cpos, alloc_size, defrag_run);
+
+			status = ocfs2_defrag_extent(context, cpos, phys_cpos,
+						     &alloc_size, ext_flags);
+		} else {
+			status = ocfs2_move_extent(context, cpos, phys_cpos,
+						   &goal_cpos, alloc_size,
+						   ext_flags);
+
+			goal_cpos += alloc_size;
+		}
+
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
+
+		context->clusters_moved += alloc_size;
+	}
+
+done:
+	/* the whole range was walked without an error */
+	range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
+
+out:
+	/* report progress in bytes, also when stopping on an error */
+	range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
+						      context->clusters_moved);
+	range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
+						       context->new_phys_cpos);
+
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &context->dealloc);
+
+	return status;
+}
+
+/*
+ * Run one move/defragment request with the inode pinned down, then
+ * record a fresh ctime for it.
+ *
+ * Acquired in this order: i_mutex, ocfs2_rw_lock(inode, 1),
+ * ocfs2_inode_lock(inode, &di_bh, 1) and ip_alloc_sem for writing.
+ * ip_alloc_sem is dropped as soon as the range has been processed; the
+ * others are released in reverse order on the way out.
+ *
+ * Returns 0 on success, -ENOENT when the context carries no inode,
+ * -EROFS when the filesystem is hard or soft read-only, or the error of
+ * the first step that failed.
+ */
+static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
+{
+	int status;
+	handle_t *handle;
+	struct inode *inode = context->inode;
+	struct ocfs2_dinode *di;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_super *osb;
+
+	if (!inode)
+		return -ENOENT;
+
+	/*
+	 * inode->i_sb may only be looked at once inode is known to be
+	 * non-NULL, so this must not be done in the declaration above.
+	 */
+	osb = OCFS2_SB(inode->i_sb);
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * This prevents concurrent writes from other nodes
+	 */
+	status = ocfs2_rw_lock(inode, 1);
+	if (status) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (status) {
+		mlog_errno(status);
+		goto out_rw_unlock;
+	}
+
+	/*
+	 * remember ip_xattr_sem also needs to be held if necessary
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	status = __ocfs2_move_extents_range(di_bh, context);
+
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	if (status) {
+		mlog_errno(status);
+		goto out_inode_unlock;
+	}
+
+	/*
+	 * We update ctime for these changes
+	 */
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_inode_unlock;
+	}
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	/* keep the in-memory and on-disk ctime in step */
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	inode->i_ctime = CURRENT_TIME;
+	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_inode_unlock:
+	brelse(di_bh);
+	ocfs2_inode_unlock(inode, 1);
+out_rw_unlock:
+	ocfs2_rw_unlock(inode, 1);
+out:
+	mutex_unlock(&inode->i_mutex);
+
+	return status;
+}
+
+/*
+ * ioctl entry point for moving or defragmenting the extents of a file.
+ *
+ * @filp: the open file; it must be a regular file opened for writing
+ *        and must not be immutable or append-only.
+ * @argp: user pointer to a struct ocfs2_move_extents.  The request is
+ *        copied in, clamped to the file size, and copied back out with
+ *        the progress fields filled in, also when the operation fails
+ *        part-way.
+ *
+ * Returns 0 on success, -EINVAL for a NULL @argp or a start offset
+ * beyond EOF, -EPERM for an unsuitable file, -ENOMEM, -EFAULT when the
+ * request cannot be copied in or out, or the error of the operation
+ * itself.
+ */
+int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
+{
+	int status;
+	u64 isize;
+
+	struct inode *inode = file_inode(filp);
+	struct ocfs2_move_extents range;
+	struct ocfs2_move_extents_context *context;
+
+	if (!argp)
+		return -EINVAL;
+
+	status = mnt_want_write_file(filp);
+	if (status)
+		return status;
+
+	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
+		status = -EPERM;
+		goto out_drop;
+	}
+
+	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+		status = -EPERM;
+		goto out_drop;
+	}
+
+	context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
+	if (!context) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out_drop;
+	}
+
+	context->inode = inode;
+	context->file = filp;
+
+	if (copy_from_user(&range, argp, sizeof(range))) {
+		status = -EFAULT;
+		goto out_free;
+	}
+
+	/*
+	 * Take one snapshot of the file size.  No lock is held here, so
+	 * reading it again for every check below could see different
+	 * values and make the subtraction further down underflow.
+	 */
+	isize = i_size_read(inode);
+
+	if (range.me_start > isize) {
+		status = -EINVAL;
+		goto out_free;
+	}
+
+	/*
+	 * Clamp the request to EOF.  me_start and me_len come straight
+	 * from userspace, so their sum may wrap around; comparing against
+	 * the room left after me_start cannot, because me_start <= isize
+	 * was established above.
+	 */
+	if (range.me_len > isize - range.me_start)
+		range.me_len = isize - range.me_start;
+
+	context->range = &range;
+
+	if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
+		context->auto_defrag = 1;
+		/*
+		 * ok, the default threshold for the defragmentation
+		 * is 1M, since our maximum clustersize was 1M also.
+		 * any thought?
+		 */
+		if (!range.me_threshold)
+			range.me_threshold = 1024 * 1024;
+
+		if (range.me_threshold > isize)
+			range.me_threshold = isize;
+
+		if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
+			context->partial = 1;
+	} else {
+		/*
+		 * first best-effort attempt to validate and adjust the goal
+		 * (physical address in block), while it can't guarantee later
+		 * operation can succeed all the time since global_bitmap may
+		 * change a bit over time.
+		 */
+
+		status = ocfs2_validate_and_adjust_move_goal(inode, &range);
+		if (status)
+			goto out_copy;
+	}
+
+	status = ocfs2_move_extents(context);
+	if (status)
+		mlog_errno(status);
+out_copy:
+	/*
+	 * movement/defragmentation may end up being partially completed,
+	 * that's the reason why we need to return userspace the finished
+	 * length and new_offset even if failure happens somewhere.
+	 */
+	if (copy_to_user(argp, &range, sizeof(range)))
+		status = -EFAULT;
+
+out_free:
+	kfree(context);
+out_drop:
+	mnt_drop_write_file(filp);
+
+	return status;
+}
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
new file mode 100644
index 00000000000..4e143e81144
--- /dev/null
+++ b/fs/ocfs2/move_extents.h
@@ -0,0 +1,22 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.h
+ *
+ * Copyright (C) 2011 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_MOVE_EXTENTS_H
+#define OCFS2_MOVE_EXTENTS_H
+
+int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
+
+#endif /* OCFS2_MOVE_EXTENTS_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 33464c6b60a..8add6f1030d 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -42,7 +42,6 @@
#include <linux/highmem.h>
#include <linux/quotaops.h>
-#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -63,13 +62,13 @@
#include "uptodate.h"
#include "xattr.h"
#include "acl.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
static int ocfs2_mknod_locked(struct ocfs2_super *osb,
struct inode *dir,
struct inode *inode,
- struct dentry *dentry,
dev_t dev,
struct buffer_head **new_fe_bh,
struct buffer_head *parent_fe_bh,
@@ -78,14 +77,14 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
struct inode **ret_orphan_dir,
- struct inode *inode,
+ u64 blkno,
char *name,
struct ocfs2_dir_lookup_result *lookup);
static int ocfs2_orphan_add(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct ocfs2_dinode *fe,
+ struct buffer_head *fe_bh,
char *name,
struct ocfs2_dir_lookup_result *lookup,
struct inode *orphan_dir_inode);
@@ -99,7 +98,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
int status;
u64 blkno;
@@ -107,18 +106,16 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
struct dentry *ret;
struct ocfs2_inode_info *oi;
- mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_lookup(dir, dentry, dentry->d_name.len,
+ dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, 0);
if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
ret = ERR_PTR(-ENAMETOOLONG);
goto bail;
}
- mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
- dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
-
- status = ocfs2_inode_lock(dir, NULL, 0);
+ status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -148,7 +145,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
spin_unlock(&oi->ip_lock);
bail_add:
- dentry->d_op = &ocfs2_dentry_ops;
ret = d_splice_alias(inode, dentry);
if (inode) {
@@ -172,7 +168,8 @@ bail_add:
ret = ERR_PTR(status);
goto bail_unlock;
}
- }
+ } else
+ ocfs2_dentry_attach_gen(dentry);
bail_unlock:
/* Don't drop the cluster lock until *after* the d_add --
@@ -183,12 +180,12 @@ bail_unlock:
bail:
- mlog_exit_ptr(ret);
+ trace_ocfs2_lookup_ret(ret);
return ret;
}
-static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
+static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
{
struct inode *inode;
@@ -202,24 +199,30 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
* these are used by the support functions here and in
* callers. */
if (S_ISDIR(mode))
- inode->i_nlink = 2;
- else
- inode->i_nlink = 1;
- inode->i_uid = current_fsuid();
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- inode->i_gid = current_fsgid();
- inode->i_mode = mode;
- vfs_dq_init(inode);
+ set_nlink(inode, 2);
+ inode_init_owner(inode, dir, mode);
+ dquot_initialize(inode);
return inode;
}
+/*
+ * Tear down the dentry lock hanging off dentry->d_fsdata again: drop and
+ * free its lock resource, detach it from the dentry under
+ * dentry_attach_lock, free it and finally drop one reference on @inode.
+ *
+ * The lock is expected to have exactly one user at this point; anything
+ * else trips the BUG_ON.
+ */
+static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb,
+		struct dentry *dentry, struct inode *inode)
+{
+	struct ocfs2_dentry_lock *attached = dentry->d_fsdata;
+
+	/* release the lock resource before the structure holding it */
+	ocfs2_simple_drop_lockres(osb, &attached->dl_lockres);
+	ocfs2_lock_res_free(&attached->dl_lockres);
+
+	BUG_ON(attached->dl_count != 1);
+
+	/* unhook from the dentry first, only then free the memory */
+	spin_lock(&dentry_attach_lock);
+	dentry->d_fsdata = NULL;
+	spin_unlock(&dentry_attach_lock);
+
+	kfree(attached);
+	iput(inode);
+}
+
static int ocfs2_mknod(struct inode *dir,
struct dentry *dentry,
- int mode,
+ umode_t mode,
dev_t dev)
{
int status = 0;
@@ -240,10 +243,16 @@ static int ocfs2_mknod(struct inode *dir,
};
int did_quota_inode = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
+ sigset_t oldset;
+ int did_block_signals = 0;
+ struct posix_acl *default_acl = NULL, *acl = NULL;
+ struct ocfs2_dentry_lock *dl = NULL;
+
+ trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long)dev, mode);
- mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
- (unsigned long)dev, dentry->d_name.len,
- dentry->d_name.name);
+ dquot_initialize(dir);
/* get our super block */
osb = OCFS2_SB(dir->i_sb);
@@ -297,7 +306,7 @@ static int ocfs2_mknod(struct inode *dir,
}
/* get security xattr */
- status = ocfs2_init_security_get(inode, dir, &si);
+ status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
if (status) {
if (status == -EOPNOTSUPP)
si.enable = 0;
@@ -339,6 +348,12 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
+ status = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (status) {
+ mlog_errno(status);
+ goto leave;
+ }
+
handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
S_ISDIR(mode),
xattr_credits));
@@ -349,17 +364,17 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- /* We don't use standard VFS wrapper because we don't want vfs_dq_init
- * to be called. */
- if (sb_any_quota_active(osb->sb) &&
- osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
- status = -EDQUOT;
+ /* Starting to change things, restart is no longer possible. */
+ ocfs2_block_signals(&oldset);
+ did_block_signals = 1;
+
+ status = dquot_alloc_inode(inode);
+ if (status)
goto leave;
- }
did_quota_inode = 1;
/* do the real work now. */
- status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
+ status = ocfs2_mknod_locked(osb, dir, inode, dev,
&new_fe_bh, parent_fe_bh, handle,
inode_ac);
if (status < 0) {
@@ -375,23 +390,29 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- status = ocfs2_journal_access_di(handle, dir, parent_fe_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(dir),
+ parent_fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
}
ocfs2_add_links_count(dirfe, 1);
- status = ocfs2_journal_dirty(handle, parent_fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, parent_fe_bh);
inc_nlink(dir);
}
- status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
- meta_ac, data_ac);
+ if (default_acl) {
+ status = ocfs2_set_acl(handle, inode, new_fe_bh,
+ ACL_TYPE_DEFAULT, default_acl,
+ meta_ac, data_ac);
+ }
+ if (!status && acl) {
+ status = ocfs2_set_acl(handle, inode, new_fe_bh,
+ ACL_TYPE_ACCESS, acl,
+ meta_ac, data_ac);
+ }
+
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -406,48 +427,51 @@ static int ocfs2_mknod(struct inode *dir,
}
}
- status = ocfs2_add_entry(handle, dentry, inode,
- OCFS2_I(inode)->ip_blkno, parent_fe_bh,
- &lookup);
- if (status < 0) {
+ /*
+ * Do this before adding the entry to the directory. We
+ * also set d_op after success so that ->d_iput() will clean up
+ * the dentry lock even if ocfs2_add_entry() fails below.
+ */
+ status = ocfs2_dentry_attach_lock(dentry, inode,
+ OCFS2_I(dir)->ip_blkno);
+ if (status) {
mlog_errno(status);
goto leave;
}
- status = ocfs2_dentry_attach_lock(dentry, inode,
- OCFS2_I(dir)->ip_blkno);
- if (status) {
+ dl = dentry->d_fsdata;
+
+ status = ocfs2_add_entry(handle, dentry, inode,
+ OCFS2_I(inode)->ip_blkno, parent_fe_bh,
+ &lookup);
+ if (status < 0) {
mlog_errno(status);
goto leave;
}
insert_inode_hash(inode);
- dentry->d_op = &ocfs2_dentry_ops;
d_instantiate(dentry, inode);
status = 0;
leave:
+ if (default_acl)
+ posix_acl_release(default_acl);
+ if (acl)
+ posix_acl_release(acl);
if (status < 0 && did_quota_inode)
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
if (handle)
ocfs2_commit_trans(osb, handle);
ocfs2_inode_unlock(dir, 1);
-
- if (status == -ENOSPC)
- mlog(0, "Disk is full\n");
+ if (did_block_signals)
+ ocfs2_unblock_signals(&oldset);
brelse(new_fe_bh);
brelse(parent_fe_bh);
- kfree(si.name);
kfree(si.value);
ocfs2_free_dir_lookup_result(&lookup);
- if ((status < 0) && inode) {
- clear_nlink(inode);
- iput(inode);
- }
-
if (inode_ac)
ocfs2_free_alloc_context(inode_ac);
@@ -457,41 +481,44 @@ leave:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
- mlog_exit(status);
+ /*
+ * We should call iput after the i_mutex of the bitmap has been
+ * unlocked in ocfs2_free_alloc_context, or
+ * ocfs2_delete_inode will mutex_lock it again.
+ */
+ if ((status < 0) && inode) {
+ if (dl)
+ ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
+ OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
+ clear_nlink(inode);
+ iput(inode);
+ }
+
+ if (status)
+ mlog_errno(status);
return status;
}
-static int ocfs2_mknod_locked(struct ocfs2_super *osb,
- struct inode *dir,
- struct inode *inode,
- struct dentry *dentry,
- dev_t dev,
- struct buffer_head **new_fe_bh,
- struct buffer_head *parent_fe_bh,
- handle_t *handle,
- struct ocfs2_alloc_context *inode_ac)
+static int __ocfs2_mknod_locked(struct inode *dir,
+ struct inode *inode,
+ dev_t dev,
+ struct buffer_head **new_fe_bh,
+ struct buffer_head *parent_fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *inode_ac,
+ u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
{
int status = 0;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dinode *fe = NULL;
struct ocfs2_extent_list *fel;
- u64 fe_blkno = 0;
- u16 suballoc_bit;
u16 feat;
-
- mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
- inode->i_mode, (unsigned long)dev, dentry->d_name.len,
- dentry->d_name.name);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
*new_fe_bh = NULL;
- status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
- inode_ac, &suballoc_bit, &fe_blkno);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
/* populate as many fields early on as possible - many of
* these are used by the support functions here and in
* callers. */
@@ -503,13 +530,14 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
if (!*new_fe_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto leave;
}
- ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), *new_fe_bh);
- status = ocfs2_journal_access_di(handle, inode, *new_fe_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ *new_fe_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -522,10 +550,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
fe->i_generation = cpu_to_le32(inode->i_generation);
fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
fe->i_blkno = cpu_to_le64(fe_blkno);
+ fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
- fe->i_uid = cpu_to_le32(inode->i_uid);
- fe->i_gid = cpu_to_le32(inode->i_gid);
+ fe->i_uid = cpu_to_le32(i_uid_read(inode));
+ fe->i_gid = cpu_to_le32(i_gid_read(inode));
fe->i_mode = cpu_to_le16(inode->i_mode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
@@ -534,7 +563,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
fe->i_last_eb_blk = 0;
strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
- le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
+ fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
fe->i_atime = fe->i_ctime = fe->i_mtime =
cpu_to_le64(CURRENT_TIME.tv_sec);
fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
@@ -558,22 +587,18 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
}
- status = ocfs2_journal_dirty(handle, *new_fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, *new_fe_bh);
ocfs2_populate_inode(inode, fe, 1);
- ocfs2_inode_set_new(osb, inode);
+ ocfs2_ci_set_new(osb, INODE_CACHE(inode));
if (!ocfs2_mount_local(osb)) {
status = ocfs2_create_new_inode_locks(inode);
if (status < 0)
mlog_errno(status);
}
- status = 0; /* error in ocfs2_create_new_inode_locks is not
- * critical */
+ oi->i_sync_tid = handle->h_transaction->t_tid;
+ oi->i_datasync_tid = handle->h_transaction->t_tid;
leave:
if (status < 0) {
@@ -583,35 +608,66 @@ leave:
}
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
+/*
+ * Claim a fresh inode from @inode_ac and hand the resulting block
+ * number and suballocator position to __ocfs2_mknod_locked(), which
+ * does the rest.
+ *
+ * *@new_fe_bh is cleared up front.  Returns the negative error from
+ * the claim if that fails, otherwise whatever __ocfs2_mknod_locked()
+ * returns.
+ */
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct inode *inode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      handle_t *handle,
+			      struct ocfs2_alloc_context *inode_ac)
+{
+	u64 new_blkno = 0;
+	u64 sub_loc;
+	u16 sub_bit;
+	int ret = 0;
+
+	*new_fe_bh = NULL;
+
+	ret = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
+				    inode_ac, &sub_loc,
+				    &sub_bit, &new_blkno);
+	if (ret >= 0)
+		return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+					    parent_fe_bh, handle, inode_ac,
+					    new_blkno, sub_loc, sub_bit);
+
+	mlog_errno(ret);
+	return ret;
+}
+
static int ocfs2_mkdir(struct inode *dir,
struct dentry *dentry,
- int mode)
+ umode_t mode)
{
int ret;
- mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+ OCFS2_I(dir)->ip_blkno, mode);
ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
}
static int ocfs2_create(struct inode *dir,
struct dentry *dentry,
- int mode,
- struct nameidata *nd)
+ umode_t mode,
+ bool excl)
{
int ret;
- mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, mode);
ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
}
@@ -628,15 +684,19 @@ static int ocfs2_link(struct dentry *old_dentry,
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dir_lookup_result lookup = { NULL, };
+ sigset_t oldset;
+ u64 old_de_ino;
- mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
- old_dentry->d_name.len, old_dentry->d_name.name,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ old_dentry->d_name.len, old_dentry->d_name.name,
+ dentry->d_name.len, dentry->d_name.name);
if (S_ISDIR(inode->i_mode))
return -EPERM;
- err = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
+ dquot_initialize(dir);
+
+ err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
if (err < 0) {
if (err != -ENOENT)
mlog_errno(err);
@@ -648,6 +708,22 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out;
}
+ err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
+ old_dentry->d_name.len, &old_de_ino);
+ if (err) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ /*
+ * Check whether another node removed the source inode while we
+ * were in the vfs.
+ */
+ if (old_de_ino != OCFS2_I(inode)->ip_blkno) {
+ err = -ENOENT;
+ goto out;
+ }
+
err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
dentry->d_name.len);
if (err)
@@ -682,7 +758,10 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out_unlock_inode;
}
- err = ocfs2_journal_access_di(handle, inode, fe_bh,
+ /* Starting to change things, restart is no longer possible. */
+ ocfs2_block_signals(&oldset);
+
+ err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (err < 0) {
mlog_errno(err);
@@ -694,14 +773,7 @@ static int ocfs2_link(struct dentry *old_dentry,
ocfs2_set_links_count(fe, inode->i_nlink);
fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
-
- err = ocfs2_journal_dirty(handle, fe_bh);
- if (err < 0) {
- ocfs2_add_links_count(fe, -1);
- drop_nlink(inode);
- mlog_errno(err);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, fe_bh);
err = ocfs2_add_entry(handle, dentry, inode,
OCFS2_I(inode)->ip_blkno,
@@ -719,12 +791,12 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out_commit;
}
- atomic_inc(&inode->i_count);
- dentry->d_op = &ocfs2_dentry_ops;
+ ihold(inode);
d_instantiate(dentry, inode);
out_commit:
ocfs2_commit_trans(osb, handle);
+ ocfs2_unblock_signals(&oldset);
out_unlock_inode:
ocfs2_inode_unlock(inode, 1);
@@ -736,7 +808,8 @@ out:
ocfs2_free_dir_lookup_result(&lookup);
- mlog_exit(err);
+ if (err)
+ mlog_errno(err);
return err;
}
@@ -758,7 +831,7 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry)
return ret;
}
-static inline int inode_is_unlinkable(struct inode *inode)
+static inline int ocfs2_inode_is_unlinkable(struct inode *inode)
{
if (S_ISDIR(inode->i_mode)) {
if (inode->i_nlink == 2)
@@ -776,6 +849,7 @@ static int ocfs2_unlink(struct inode *dir,
{
int status;
int child_locked = 0;
+ bool is_unlinkable = false;
struct inode *inode = dentry->d_inode;
struct inode *orphan_dir = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -788,19 +862,20 @@ static int ocfs2_unlink(struct inode *dir,
struct ocfs2_dir_lookup_result lookup = { NULL, };
struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
- mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_unlink(dir, dentry, dentry->d_name.len,
+ dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
- BUG_ON(dentry->d_parent->d_inode != dir);
+ dquot_initialize(dir);
- mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ BUG_ON(dentry->d_parent->d_inode != dir);
- if (inode == osb->root_inode) {
- mlog(0, "Cannot delete the root directory\n");
+ if (inode == osb->root_inode)
return -EPERM;
- }
- status = ocfs2_inode_lock(dir, &parent_node_bh, 1);
+ status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1,
+ OI_LS_PARENT);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -819,9 +894,10 @@ static int ocfs2_unlink(struct inode *dir,
if (OCFS2_I(inode)->ip_blkno != blkno) {
status = -ENOENT;
- mlog(0, "ip_blkno %llu != dirent blkno %llu ip_flags = %x\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)blkno, OCFS2_I(inode)->ip_flags);
+ trace_ocfs2_unlink_noent(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)blkno,
+ OCFS2_I(inode)->ip_flags);
goto leave;
}
@@ -848,13 +924,15 @@ static int ocfs2_unlink(struct inode *dir,
goto leave;
}
- if (inode_is_unlinkable(inode)) {
- status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
+ if (ocfs2_inode_is_unlinkable(inode)) {
+ status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
+ OCFS2_I(inode)->ip_blkno,
orphan_name, &orphan_insert);
if (status < 0) {
mlog_errno(status);
goto leave;
}
+ is_unlinkable = true;
}
handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
@@ -865,7 +943,7 @@ static int ocfs2_unlink(struct inode *dir,
goto leave;
}
- status = ocfs2_journal_access_di(handle, inode, fe_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -874,15 +952,6 @@ static int ocfs2_unlink(struct inode *dir,
fe = (struct ocfs2_dinode *) fe_bh->b_data;
- if (inode_is_unlinkable(inode)) {
- status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
- &orphan_insert, orphan_dir);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
- }
-
/* delete the name from the parent dir */
status = ocfs2_delete_entry(handle, dir, &lookup);
if (status < 0) {
@@ -894,12 +963,7 @@ static int ocfs2_unlink(struct inode *dir,
drop_nlink(inode);
drop_nlink(inode);
ocfs2_set_links_count(fe, inode->i_nlink);
-
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, fe_bh);
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
if (S_ISDIR(inode->i_mode))
@@ -910,6 +974,14 @@ static int ocfs2_unlink(struct inode *dir,
mlog_errno(status);
if (S_ISDIR(inode->i_mode))
inc_nlink(dir);
+ goto leave;
+ }
+
+ if (is_unlinkable) {
+ status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
+ orphan_name, &orphan_insert, orphan_dir);
+ if (status < 0)
+ mlog_errno(status);
}
leave:
@@ -934,11 +1006,71 @@ leave:
ocfs2_free_dir_lookup_result(&orphan_insert);
ocfs2_free_dir_lookup_result(&lookup);
- mlog_exit(status);
+ if (status && (status != -ENOTEMPTY) && (status != -ENOENT))
+ mlog_errno(status);
return status;
}
+/*
+ * Follow the ".." entries upwards from @src_inode_no and report whether
+ * @dest_inode_no shows up on the way.
+ *
+ * Returns 1 if it does, 0 if the root inode is reached first or the
+ * walk gives up after MAX_LOOKUP_TIMES steps, and a negative error if
+ * an inode cannot be read or locked or a ".." lookup fails (the latter
+ * is reported as -ENOENT).
+ */
+static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
+		u64 src_inode_no, u64 dest_inode_no)
+{
+	int status = 0, hops;
+	u64 up_blkno = 0;
+	u64 cur_blkno = src_inode_no;
+	struct inode *cur;
+
+#define MAX_LOOKUP_TIMES 32
+	for (hops = 1; ; hops++) {
+		cur = ocfs2_iget(osb, cur_blkno, 0, 0);
+		if (IS_ERR(cur))
+			return PTR_ERR(cur);
+
+		status = ocfs2_inode_lock(cur, NULL, 0);
+		if (status < 0) {
+			iput(cur);
+			if (status != -ENOENT)
+				mlog_errno(status);
+			return status;
+		}
+
+		/* the lock and the reference are only needed for the lookup */
+		status = ocfs2_lookup_ino_from_name(cur, "..", 2, &up_blkno);
+		ocfs2_inode_unlock(cur, 0);
+		iput(cur);
+		if (status < 0)
+			return -ENOENT;
+
+		if (up_blkno == dest_inode_no)
+			return 1;
+
+		if (up_blkno == osb->root_inode->i_ino)
+			return 0;
+
+		/* climb one level and try again */
+		cur_blkno = up_blkno;
+
+		if (hops >= MAX_LOOKUP_TIMES) {
+			mlog(ML_NOTICE, "max lookup times reached, filesystem "
+					"may have nested directories, "
+					"src inode: %llu, dest inode: %llu.\n",
+					(unsigned long long)src_inode_no,
+					(unsigned long long)dest_inode_no);
+			return 0;
+		}
+	}
+}
+
/*
* The only place this should be used is rename!
* if they have the same id, then the 1st one is the only one locked.
@@ -950,25 +1082,41 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
struct inode *inode2)
{
int status;
+ int inode1_is_ancestor, inode2_is_ancestor;
struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
struct buffer_head **tmpbh;
struct inode *tmpinode;
- mlog_entry("(inode1 = %llu, inode2 = %llu)\n",
- (unsigned long long)oi1->ip_blkno,
- (unsigned long long)oi2->ip_blkno);
+ trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
if (*bh1)
*bh1 = NULL;
if (*bh2)
*bh2 = NULL;
- /* we always want to lock the one with the lower lockid first. */
+ /* we always want to lock the one with the lower lockid first.
+ * and if they are nested, we lock ancestor first */
if (oi1->ip_blkno != oi2->ip_blkno) {
- if (oi1->ip_blkno < oi2->ip_blkno) {
+ inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
+ oi1->ip_blkno);
+ if (inode1_is_ancestor < 0) {
+ status = inode1_is_ancestor;
+ goto bail;
+ }
+
+ inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
+ oi2->ip_blkno);
+ if (inode2_is_ancestor < 0) {
+ status = inode2_is_ancestor;
+ goto bail;
+ }
+
+ if ((inode1_is_ancestor == 1) ||
+ (oi1->ip_blkno < oi2->ip_blkno &&
+ inode2_is_ancestor == 0)) {
/* switch id1 and id2 around */
- mlog(0, "switching them around...\n");
tmpbh = bh2;
bh2 = bh1;
bh1 = tmpbh;
@@ -978,7 +1126,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
inode1 = tmpinode;
}
/* lock id2 */
- status = ocfs2_inode_lock(inode2, bh2, 1);
+ status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+ OI_LS_RENAME1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -987,21 +1136,29 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
}
/* lock id1 */
- status = ocfs2_inode_lock(inode1, bh1, 1);
+ status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
if (status < 0) {
/*
* An error return must mean that no cluster locks
* were held on function exit.
*/
- if (oi1->ip_blkno != oi2->ip_blkno)
+ if (oi1->ip_blkno != oi2->ip_blkno) {
ocfs2_inode_unlock(inode2, 1);
+ brelse(*bh2);
+ *bh2 = NULL;
+ }
if (status != -ENOENT)
mlog_errno(status);
}
+ trace_ocfs2_double_lock_end(
+ (unsigned long long)OCFS2_I(inode1)->ip_blkno,
+ (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1032,21 +1189,24 @@ static int ocfs2_rename(struct inode *old_dir,
handle_t *handle = NULL;
struct buffer_head *old_dir_bh = NULL;
struct buffer_head *new_dir_bh = NULL;
- nlink_t old_dir_nlink = old_dir->i_nlink;
+ u32 old_dir_nlink = old_dir->i_nlink;
struct ocfs2_dinode *old_di;
struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
struct ocfs2_dir_lookup_result target_insert = { NULL, };
+ bool should_add_orphan = false;
/* At some point it might be nice to break this function up a
* bit. */
- mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
- old_dir, old_dentry, new_dir, new_dentry,
- old_dentry->d_name.len, old_dentry->d_name.name,
- new_dentry->d_name.len, new_dentry->d_name.name);
+ trace_ocfs2_rename(old_dir, old_dentry, new_dir, new_dentry,
+ old_dentry->d_name.len, old_dentry->d_name.name,
+ new_dentry->d_name.len, new_dentry->d_name.name);
+
+ dquot_initialize(old_dir);
+ dquot_initialize(new_dir);
osb = OCFS2_SB(old_dir->i_sb);
@@ -1073,6 +1233,21 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
rename_lock = 1;
+
+ /* here we cannot guarantee the inodes haven't just been
+ * changed, so check if they are nested again */
+ status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
+ old_inode->i_ino);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ } else if (status == 1) {
+ status = -EPERM;
+ trace_ocfs2_rename_not_permitted(
+ (unsigned long long)old_inode->i_ino,
+ (unsigned long long)new_dir->i_ino);
+ goto bail;
+ }
}
/* if old and new are the same, this'll just do one lock. */
@@ -1103,7 +1278,8 @@ static int ocfs2_rename(struct inode *old_dir,
* won't have to concurrently downconvert the inode and the
* dentry locks.
*/
- status = ocfs2_inode_lock(old_inode, &old_inode_bh, 1);
+ status = ocfs2_inode_lock_nested(old_inode, &old_inode_bh, 1,
+ OI_LS_PARENT);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -1199,16 +1375,15 @@ static int ocfs2_rename(struct inode *old_dir,
if (!new_inode) {
status = -EACCES;
- mlog(0, "We found an inode for name %.*s but VFS "
- "didn't give us one.\n", new_dentry->d_name.len,
- new_dentry->d_name.name);
+ trace_ocfs2_rename_target_exists(new_dentry->d_name.len,
+ new_dentry->d_name.name);
goto bail;
}
if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
status = -EACCES;
- mlog(0, "Inode %llu and dir %llu disagree. flags = %x\n",
+ trace_ocfs2_rename_disagree(
(unsigned long long)OCFS2_I(new_inode)->ip_blkno,
(unsigned long long)newfe_blkno,
OCFS2_I(new_inode)->ip_flags);
@@ -1231,20 +1406,19 @@ static int ocfs2_rename(struct inode *old_dir,
newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
- mlog(0, "aha rename over existing... new_blkno=%llu "
- "newfebh=%p bhblocknr=%llu\n",
+ trace_ocfs2_rename_over_existing(
(unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
(unsigned long long)newfe_bh->b_blocknr : 0ULL);
if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
- new_inode,
- orphan_name,
- &orphan_insert);
+ OCFS2_I(new_inode)->ip_blkno,
+ orphan_name, &orphan_insert);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+ should_add_orphan = true;
}
} else {
BUG_ON(new_dentry->d_parent->d_inode != new_dir);
@@ -1281,24 +1455,14 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
}
- status = ocfs2_journal_access_di(handle, new_inode, newfe_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(new_inode),
+ newfe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- if (S_ISDIR(new_inode->i_mode) ||
- (ocfs2_read_links_count(newfe) == 1)) {
- status = ocfs2_orphan_add(osb, handle, new_inode,
- newfe, orphan_name,
- &orphan_insert, orphan_dir);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
-
/* change the dirent to point to the correct inode */
status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
old_inode);
@@ -1312,11 +1476,15 @@ static int ocfs2_rename(struct inode *old_dir,
ocfs2_set_links_count(newfe, 0);
else
ocfs2_add_links_count(newfe, -1);
-
- status = ocfs2_journal_dirty(handle, newfe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ ocfs2_journal_dirty(handle, newfe_bh);
+ if (should_add_orphan) {
+ status = ocfs2_orphan_add(osb, handle, new_inode,
+ newfe_bh, orphan_name,
+ &orphan_insert, orphan_dir);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
}
} else {
/* if the name was not found in new_dir, add it now */
@@ -1328,17 +1496,15 @@ static int ocfs2_rename(struct inode *old_dir,
old_inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(old_inode);
- status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode),
+ old_inode_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status >= 0) {
old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
-
- status = ocfs2_journal_dirty(handle, old_inode_bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, old_inode_bh);
} else
mlog_errno(status);
@@ -1362,7 +1528,7 @@ static int ocfs2_rename(struct inode *old_dir,
}
if (new_inode) {
- new_inode->i_nlink--;
+ drop_nlink(new_inode);
new_inode->i_ctime = CURRENT_TIME;
}
old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
@@ -1370,9 +1536,9 @@ static int ocfs2_rename(struct inode *old_dir,
if (update_dot_dot) {
status = ocfs2_update_entry(old_inode, handle,
&old_inode_dot_dot_res, new_dir);
- old_dir->i_nlink--;
+ drop_nlink(old_dir);
if (new_inode) {
- new_inode->i_nlink--;
+ drop_nlink(new_inode);
} else {
inc_nlink(new_dir);
mark_inode_dirty(new_dir);
@@ -1404,12 +1570,13 @@ static int ocfs2_rename(struct inode *old_dir,
(int)old_dir_nlink, old_dir->i_nlink);
} else {
struct ocfs2_dinode *fe;
- status = ocfs2_journal_access_di(handle, old_dir,
- old_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(old_dir),
+ old_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
ocfs2_set_links_count(fe, old_dir->i_nlink);
- status = ocfs2_journal_dirty(handle, old_dir_bh);
+ ocfs2_journal_dirty(handle, old_dir_bh);
}
}
ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
@@ -1454,7 +1621,8 @@ bail:
brelse(old_dir_bh);
brelse(new_dir_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1479,9 +1647,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
* write i_size + 1 bytes. */
blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
- mlog_entry("i_blocks = %llu, i_size = %llu, blocks = %d\n",
- (unsigned long long)inode->i_blocks,
- i_size_read(inode), blocks);
+ trace_ocfs2_create_symlink_data((unsigned long long)inode->i_blocks,
+ i_size_read(inode), blocks);
/* Sanity check -- make sure we're going to fit. */
if (bytes_left >
@@ -1524,9 +1691,11 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
- ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode),
+ bhs[virtual]);
- status = ocfs2_journal_access(handle, inode, bhs[virtual],
+ status = ocfs2_journal_access(handle, INODE_CACHE(inode),
+ bhs[virtual],
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
@@ -1539,11 +1708,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
(bytes_left > sb->s_blocksize) ? sb->s_blocksize :
bytes_left);
- status = ocfs2_journal_dirty(handle, bhs[virtual]);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, bhs[virtual]);
virtual++;
p_blkno++;
@@ -1559,7 +1724,8 @@ bail:
kfree(bhs);
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1587,9 +1753,14 @@ static int ocfs2_symlink(struct inode *dir,
};
int did_quota = 0, did_quota_inode = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
+ sigset_t oldset;
+ int did_block_signals = 0;
+ struct ocfs2_dentry_lock *dl = NULL;
- mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
- dentry, symname, dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_symlink_begin(dir, dentry, symname,
+ dentry->d_name.len, dentry->d_name.name);
+
+ dquot_initialize(dir);
sb = dir->i_sb;
osb = OCFS2_SB(sb);
@@ -1641,7 +1812,7 @@ static int ocfs2_symlink(struct inode *dir,
}
/* get security xattr */
- status = ocfs2_init_security_get(inode, dir, &si);
+ status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
if (status) {
if (status == -EOPNOTSUPP)
si.enable = 0;
@@ -1680,16 +1851,21 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
- /* We don't use standard VFS wrapper because we don't want vfs_dq_init
- * to be called. */
- if (sb_any_quota_active(osb->sb) &&
- osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
- status = -EDQUOT;
+ /* Starting to change things, restart is no longer possible. */
+ ocfs2_block_signals(&oldset);
+ did_block_signals = 1;
+
+ status = dquot_alloc_inode(inode);
+ if (status)
goto bail;
- }
did_quota_inode = 1;
- status = ocfs2_mknod_locked(osb, dir, inode, dentry,
+ trace_ocfs2_symlink_create(dir, dentry, dentry->d_name.len,
+ dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ inode->i_mode);
+
+ status = ocfs2_mknod_locked(osb, dir, inode,
0, &new_fe_bh, parent_fe_bh, handle,
inode_ac);
if (status < 0) {
@@ -1700,16 +1876,16 @@ static int ocfs2_symlink(struct inode *dir,
fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
inode->i_rdev = 0;
newsize = l - 1;
+ inode->i_op = &ocfs2_symlink_inode_operations;
if (l > ocfs2_fast_symlink_chars(sb)) {
u32 offset = 0;
- inode->i_op = &ocfs2_symlink_inode_operations;
- if (vfs_dq_alloc_space_nodirty(inode,
- ocfs2_clusters_to_bytes(osb->sb, 1))) {
- status = -EDQUOT;
+ status = dquot_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (status)
goto bail;
- }
did_quota = 1;
+ inode->i_mapping->a_ops = &ocfs2_aops;
status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
new_fe_bh,
handle, data_ac, NULL,
@@ -1727,7 +1903,7 @@ static int ocfs2_symlink(struct inode *dir,
i_size_write(inode, newsize);
inode->i_blocks = ocfs2_inode_sector_count(inode);
} else {
- inode->i_op = &ocfs2_fast_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
memcpy((char *) fe->id2.i_symlink, symname, l);
i_size_write(inode, newsize);
inode->i_blocks = 0;
@@ -1757,37 +1933,44 @@ static int ocfs2_symlink(struct inode *dir,
}
}
- status = ocfs2_add_entry(handle, dentry, inode,
- le64_to_cpu(fe->i_blkno), parent_fe_bh,
- &lookup);
- if (status < 0) {
+ /*
+ * Do this before adding the entry to the directory. We
+ * also set d_op after success so that ->d_iput() will clean up
+ * the dentry lock even if ocfs2_add_entry() fails below.
+ */
+ status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
+ if (status) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
- if (status) {
+ dl = dentry->d_fsdata;
+
+ status = ocfs2_add_entry(handle, dentry, inode,
+ le64_to_cpu(fe->i_blkno), parent_fe_bh,
+ &lookup);
+ if (status < 0) {
mlog_errno(status);
goto bail;
}
insert_inode_hash(inode);
- dentry->d_op = &ocfs2_dentry_ops;
d_instantiate(dentry, inode);
bail:
if (status < 0 && did_quota)
- vfs_dq_free_space_nodirty(inode,
+ dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status < 0 && did_quota_inode)
- vfs_dq_free_inode(inode);
+ dquot_free_inode(inode);
if (handle)
ocfs2_commit_trans(osb, handle);
ocfs2_inode_unlock(dir, 1);
+ if (did_block_signals)
+ ocfs2_unblock_signals(&oldset);
brelse(new_fe_bh);
brelse(parent_fe_bh);
- kfree(si.name);
kfree(si.value);
ocfs2_free_dir_lookup_result(&lookup);
if (inode_ac)
@@ -1797,11 +1980,16 @@ bail:
if (xattr_ac)
ocfs2_free_alloc_context(xattr_ac);
if ((status < 0) && inode) {
+ if (dl)
+ ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
+ OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
clear_nlink(inode);
iput(inode);
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1810,8 +1998,6 @@ static int ocfs2_blkno_stringify(u64 blkno, char *name)
{
int status, namelen;
- mlog_entry_void();
-
namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx",
(long long)blkno);
if (namelen <= 0) {
@@ -1828,76 +2014,133 @@ static int ocfs2_blkno_stringify(u64 blkno, char *name)
goto bail;
}
- mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
- namelen);
+ trace_ocfs2_blkno_stringify(blkno, name, namelen);
status = 0;
bail:
- mlog_exit(status);
+ if (status < 0)
+ mlog_errno(status);
return status;
}
-static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
- struct inode **ret_orphan_dir,
- struct inode *inode,
- char *name,
- struct ocfs2_dir_lookup_result *lookup)
+static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
+ struct inode **ret_orphan_dir,
+ struct buffer_head **ret_orphan_dir_bh)
{
struct inode *orphan_dir_inode;
struct buffer_head *orphan_dir_bh = NULL;
- int status = 0;
-
- status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
- if (status < 0) {
- mlog_errno(status);
- return status;
- }
+ int ret = 0;
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
ORPHAN_DIR_SYSTEM_INODE,
osb->slot_num);
if (!orphan_dir_inode) {
- status = -ENOENT;
- mlog_errno(status);
- return status;
+ ret = -ENOENT;
+ mlog_errno(ret);
+ return ret;
}
mutex_lock(&orphan_dir_inode->i_mutex);
- status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
+ ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ if (ret < 0) {
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
+
+ mlog_errno(ret);
+ return ret;
}
- status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
- orphan_dir_bh, name,
- OCFS2_ORPHAN_NAMELEN, lookup);
- if (status < 0) {
- ocfs2_inode_unlock(orphan_dir_inode, 1);
+ *ret_orphan_dir = orphan_dir_inode;
+ *ret_orphan_dir_bh = orphan_dir_bh;
- mlog_errno(status);
- goto leave;
+ return 0;
+}
+
+static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
+ struct buffer_head *orphan_dir_bh,
+ u64 blkno,
+ char *name,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
+
+ ret = ocfs2_blkno_stringify(blkno, name); /* name <- "%016llx" of blkno */
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
+ orphan_dir_bh, name,
+ OCFS2_ORPHAN_NAMELEN, lookup);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for
+ * insertion of an orphan.
+ * @osb: ocfs2 file system
+ * @ret_orphan_dir: Orphan dir inode - returned locked!
+ * @blkno: Actual block number of the inode to be inserted into orphan dir.
+ * @name: filled with the orphan dirent name (the stringified @blkno)
+ * @lookup: dir lookup result, passed back into functions like ocfs2_orphan_add
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure.
+ */
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+ struct inode **ret_orphan_dir,
+ u64 blkno,
+ char *name,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ struct inode *orphan_dir_inode = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
+ int ret = 0;
+
+ ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode,
+ &orphan_dir_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
+ blkno, name, lookup);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
}
*ret_orphan_dir = orphan_dir_inode;
-leave:
- if (status) {
+out:
+ brelse(orphan_dir_bh);
+
+ if (ret) {
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
mutex_unlock(&orphan_dir_inode->i_mutex);
iput(orphan_dir_inode);
}
- brelse(orphan_dir_bh);
-
- mlog_exit(status);
- return status;
+ if (ret)
+ mlog_errno(ret);
+ return ret;
}
static int ocfs2_orphan_add(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct ocfs2_dinode *fe,
+ struct buffer_head *fe_bh,
char *name,
struct ocfs2_dir_lookup_result *lookup,
struct inode *orphan_dir_inode)
@@ -1905,8 +2148,10 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
struct buffer_head *orphan_dir_bh = NULL;
int status = 0;
struct ocfs2_dinode *orphan_fe;
+ struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
- mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+ trace_ocfs2_orphan_add_begin(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
if (status < 0) {
@@ -1914,7 +2159,24 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
goto leave;
}
- status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh,
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(orphan_dir_inode),
+ orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /*
+ * We're going to journal the change of i_flags and i_orphaned_slot.
+ * It's safe anyway, though some callers may duplicate the journaling.
+ * Journaling within the func just makes the logic look more
+ * straightforward.
+ */
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(inode),
+ fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1926,13 +2188,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
if (S_ISDIR(inode->i_mode))
ocfs2_add_links_count(orphan_fe, 1);
- orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
-
- status = ocfs2_journal_dirty(handle, orphan_dir_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+ ocfs2_journal_dirty(handle, orphan_dir_bh);
status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
OCFS2_ORPHAN_NAMELEN, inode,
@@ -1940,23 +2197,32 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
orphan_dir_bh, lookup);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto rollback;
}
- le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
+ fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
+ OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
/* Record which orphan dir our inode now resides
* in. delete_inode will use this to determine which orphan
* dir to lock. */
fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
- mlog(0, "Inode %llu orphaned in slot %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
+ ocfs2_journal_dirty(handle, fe_bh);
+
+ trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ osb->slot_num);
+
+rollback:
+ if (status < 0) {
+ if (S_ISDIR(inode->i_mode))
+ ocfs2_add_links_count(orphan_fe, -1);
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+ }
leave:
brelse(orphan_dir_bh);
- mlog_exit(status);
return status;
}
@@ -1972,17 +2238,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
int status = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
- mlog_entry_void();
-
status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- mlog(0, "removing '%s' from orphan dir %llu (namelen=%d)\n",
- name, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
- OCFS2_ORPHAN_NAMELEN);
+ trace_ocfs2_orphan_del(
+ (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
+ name, OCFS2_ORPHAN_NAMELEN);
/* find it's spot in the orphan directory */
status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
@@ -1999,7 +2263,9 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
goto leave;
}
- status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh,
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(orphan_dir_inode),
+ orphan_dir_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -2010,18 +2276,368 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
if (S_ISDIR(inode->i_mode))
ocfs2_add_links_count(orphan_fe, -1);
- orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+ ocfs2_journal_dirty(handle, orphan_dir_bh);
+
+leave:
+ ocfs2_free_dir_lookup_result(&lookup);
+
+ if (status)
+ mlog_errno(status);
+ return status;
+}
+
+/**
+ * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to receive a newly
+ * allocated file. This is different from the typical 'add to orphan dir'
+ * operation in that the inode does not yet exist. This is a problem because
+ * the orphan dir stringifies the inode block number to come up with its
+ * dirent. Obviously if the inode does not yet exist we have a chicken and egg
+ * problem. This function works around it by calling deeper into the orphan
+ * and suballoc code than other callers. Use this only by necessity.
+ * @dir: The directory which this inode will ultimately wind up under - not the
+ * orphan dir!
+ * @dir_bh: buffer_head of the @dir inode block
+ * @orphan_name: string of length (OCFS2_ORPHAN_NAMELEN + 1). Will be filled
+ * with the string to be used for orphan dirent. Pass back to the orphan dir
+ * code.
+ * @ret_orphan_dir: orphan dir inode, returned locked, to be passed back into
+ * orphan dir code.
+ * @ret_di_blkno: block number where the new inode will be allocated.
+ * @orphan_insert: Dir insert context to be passed back into orphan dir code.
+ * @ret_inode_ac: Inode alloc context to be passed back to the allocator.
+ *
+ * Returns zero on success, in which case @ret_orphan_dir, @ret_di_blkno,
+ * @ret_inode_ac, @orphan_name and @orphan_insert are populated.
+ *
+ * Returns non-zero on failure; the orphan dir is not left locked.
+ */
+static int ocfs2_prep_new_orphaned_file(struct inode *dir,
+ struct buffer_head *dir_bh,
+ char *orphan_name,
+ struct inode **ret_orphan_dir,
+ u64 *ret_di_blkno,
+ struct ocfs2_dir_lookup_result *orphan_insert,
+ struct ocfs2_alloc_context **ret_inode_ac)
+{
+ int ret;
+ u64 di_blkno;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct inode *orphan_dir = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
+ struct ocfs2_alloc_context *inode_ac = NULL;
+
+ ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* reserve an inode spot */
+ ret = ocfs2_reserve_new_inode(osb, &inode_ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac,
+ &di_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
+ di_blkno, orphan_name, orphan_insert);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ret == 0) {
+ *ret_orphan_dir = orphan_dir;
+ *ret_di_blkno = di_blkno;
+ *ret_inode_ac = inode_ac;
+ /*
+ * orphan_name and orphan_insert are already up to
+ * date via __ocfs2_prepare_orphan_dir()
+ */
+ } else {
+ /* Unroll reserve_new_inode* */
+ if (inode_ac)
+ ocfs2_free_alloc_context(inode_ac);
+
+ /* Unroll orphan dir locking */
+ mutex_unlock(&orphan_dir->i_mutex);
+ ocfs2_inode_unlock(orphan_dir, 1);
+ iput(orphan_dir);
+ }
+
+ brelse(orphan_dir_bh);
+
+ return ret;
+}
+
+int ocfs2_create_inode_in_orphan(struct inode *dir,
+ int mode,
+ struct inode **new_inode)
+{
+ int status, did_quota_inode = 0;
+ struct inode *inode = NULL;
+ struct inode *orphan_dir = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct ocfs2_dinode *di = NULL;
+ handle_t *handle = NULL;
+ char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+ struct buffer_head *parent_di_bh = NULL;
+ struct buffer_head *new_di_bh = NULL;
+ struct ocfs2_alloc_context *inode_ac = NULL;
+ struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+ u64 uninitialized_var(di_blkno), suballoc_loc;
+ u16 suballoc_bit;
+
+ status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+
+ status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh,
+ orphan_name, &orphan_dir,
+ &di_blkno, &orphan_insert, &inode_ac);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto leave;
+ }
+
+ inode = ocfs2_get_init_inode(dir, mode);
+ if (!inode) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ status = dquot_alloc_inode(inode);
+ if (status)
+ goto leave;
+ did_quota_inode = 1;
+
+ status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
+ &suballoc_loc,
+ &suballoc_bit, di_blkno);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ clear_nlink(inode);
+ /* do the real work now. */
+ status = __ocfs2_mknod_locked(dir, inode,
+ 0, &new_di_bh, parent_di_bh, handle,
+ inode_ac, di_blkno, suballoc_loc,
+ suballoc_bit);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
- status = ocfs2_journal_dirty(handle, orphan_dir_bh);
+ di = (struct ocfs2_dinode *)new_di_bh->b_data;
+ status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
+ &orphan_insert, orphan_dir);
if (status < 0) {
mlog_errno(status);
goto leave;
}
+ /* get open lock so that other nodes can't remove it from orphan dir. */
+ status = ocfs2_open_lock(inode);
+ if (status < 0)
+ mlog_errno(status);
+
+ insert_inode_hash(inode);
leave:
+ if (status < 0 && did_quota_inode)
+ dquot_free_inode(inode);
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
+
+ if (orphan_dir) {
+ /* This was locked for us in ocfs2_prep_new_orphaned_file() */
+ ocfs2_inode_unlock(orphan_dir, 1);
+ mutex_unlock(&orphan_dir->i_mutex);
+ iput(orphan_dir);
+ }
+
+ if ((status < 0) && inode) {
+ clear_nlink(inode);
+ iput(inode);
+ }
+
+ if (inode_ac)
+ ocfs2_free_alloc_context(inode_ac);
+
+ brelse(new_di_bh);
+
+ if (!status)
+ *new_inode = inode;
+
+ ocfs2_free_dir_lookup_result(&orphan_insert);
+
+ ocfs2_inode_unlock(dir, 1);
+ brelse(parent_di_bh);
+ return status;
+}
+
+int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+ struct inode *inode,
+ struct dentry *dentry)
+{
+ int status = 0;
+ struct buffer_head *parent_di_bh = NULL;
+ handle_t *handle = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct ocfs2_dinode *dir_di, *di;
+ struct inode *orphan_dir_inode = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dir_lookup_result lookup = { NULL, };
+
+ trace_ocfs2_mv_orphaned_inode_to_new(dir, dentry,
+ dentry->d_name.len, dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+
+ dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
+ if (!dir_di->i_links_count) {
+ /* can't make a file in a deleted directory. */
+ status = -ENOENT;
+ goto leave;
+ }
+
+ status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+ dentry->d_name.len);
+ if (status)
+ goto leave;
+
+ /* get a spot inside the dir. */
+ status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
+ dentry->d_name.name,
+ dentry->d_name.len, &lookup);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+ ORPHAN_DIR_SYSTEM_INODE,
+ osb->slot_num);
+ if (!orphan_dir_inode) {
+ status = -EEXIST;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ mutex_lock(&orphan_dir_inode->i_mutex);
+
+ status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
+ goto leave;
+ }
+
+ status = ocfs2_read_inode_block(inode, &di_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto orphan_unlock;
+ }
+
+ handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(status);
+ goto orphan_unlock;
+ }
+
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+ orphan_dir_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);
+ di->i_orphaned_slot = 0;
+ set_nlink(inode, 1);
+ ocfs2_set_links_count(di, inode->i_nlink);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ ocfs2_journal_dirty(handle, di_bh);
+
+ status = ocfs2_add_entry(handle, dentry, inode,
+ OCFS2_I(inode)->ip_blkno, parent_di_bh,
+ &lookup);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ status = ocfs2_dentry_attach_lock(dentry, inode,
+ OCFS2_I(dir)->ip_blkno);
+ if (status) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ d_instantiate(dentry, inode);
+ status = 0;
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+orphan_unlock:
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
+leave:
+
+ ocfs2_inode_unlock(dir, 1);
+
+ brelse(di_bh);
+ brelse(parent_di_bh);
+ brelse(orphan_dir_bh);
+
ocfs2_free_dir_lookup_result(&lookup);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
+
return status;
}
@@ -2042,4 +2658,7 @@ const struct inode_operations ocfs2_dir_iops = {
.getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
.removexattr = generic_removexattr,
+ .fiemap = ocfs2_fiemap,
+ .get_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
};
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 688aef64c87..e5d059d4f11 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -35,5 +35,11 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
struct inode *orphan_dir_inode,
struct inode *inode,
struct buffer_head *orphan_dir_bh);
+int ocfs2_create_inode_in_orphan(struct inode *dir,
+ int mode,
+ struct inode **new_inode);
+int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+ struct inode *new_inode,
+ struct dentry *new_dentry);
#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1386281950d..bbec539230f 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -30,37 +30,70 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/list.h>
+#include <linux/llist.h>
#include <linux/rbtree.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/mutex.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-# include "ocfs2_jbd_compat.h"
-#endif
+#include <linux/lockdep.h>
+#include <linux/jbd2.h>
/* For union ocfs2_dlm_lksb */
#include "stackglue.h"
#include "ocfs2_fs.h"
#include "ocfs2_lockid.h"
+#include "ocfs2_ioctl.h"
+
+/* For struct ocfs2_blockcheck_stats */
+#include "blockcheck.h"
+
+#include "reservations.h"
+
+/* Caching of metadata buffers */
/* Most user visible OCFS2 inodes will have very few pieces of
* metadata, but larger files (including bitmaps, etc) must be taken
* into account when designing an access scheme. We allow a small
* amount of inlined blocks to be stored on an array and grow the
* structure into a rb tree when necessary. */
-#define OCFS2_INODE_MAX_CACHE_ARRAY 2
+#define OCFS2_CACHE_INFO_MAX_ARRAY 2
+
+/* Flags for ocfs2_caching_info */
+enum ocfs2_caching_info_flags {
+ /* Indicates that the metadata cache is using the inline array */
+ OCFS2_CACHE_FL_INLINE = 1<<1,
+};
+
+struct ocfs2_caching_operations;
struct ocfs2_caching_info {
+ /*
+ * The parent structure provides the locks, but because the
+ * parent structure can differ, it provides locking operations
+ * to struct ocfs2_caching_info.
+ */
+ const struct ocfs2_caching_operations *ci_ops;
+
+ /* next two are protected by trans_inc_lock */
+ /* which transaction were we created on? Zero if none. */
+ unsigned long ci_created_trans;
+ /* last transaction we were a part of. */
+ unsigned long ci_last_trans;
+
+ /* Cache structures */
+ unsigned int ci_flags;
unsigned int ci_num_cached;
union {
- sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
+ sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY];
struct rb_root ci_tree;
} ci_cache;
};
+/*
+ * Need this prototype here instead of in uptodate.h because journal.h
+ * uses it.
+ */
+struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci);
/* this limits us to 256 nodes
* if we need more, we can do a kmalloc for the map */
@@ -106,49 +139,84 @@ enum ocfs2_unlock_action {
#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a
call to dlm_lock. Only
exists with BUSY set. */
+#define OCFS2_LOCK_UPCONVERT_FINISHING (0x00000800) /* blocks the dc thread
+ * from downconverting
+ * before the upconvert
+ * has completed */
struct ocfs2_lock_res_ops;
typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
+#ifdef CONFIG_OCFS2_FS_STATS
+struct ocfs2_lock_stats {
+ u64 ls_total; /* Total wait in NSEC */
+ u32 ls_gets; /* Num acquires */
+ u32 ls_fail; /* Num failed acquires */
+
+ /* Storing max wait in usecs saves 24 bytes per inode */
+ u32 ls_max; /* Max wait in USEC */
+};
+#endif
+
struct ocfs2_lock_res {
void *l_priv;
struct ocfs2_lock_res_ops *l_ops;
- spinlock_t l_lock;
+
struct list_head l_blocked_list;
struct list_head l_mask_waiters;
- enum ocfs2_lock_type l_type;
unsigned long l_flags;
char l_name[OCFS2_LOCK_ID_MAX_LEN];
- int l_level;
unsigned int l_ro_holders;
unsigned int l_ex_holders;
- union ocfs2_dlm_lksb l_lksb;
+ signed char l_level;
+ signed char l_requested;
+ signed char l_blocking;
+
+ /* Data packed - type enum ocfs2_lock_type */
+ unsigned char l_type;
/* used from AST/BAST funcs. */
- enum ocfs2_ast_action l_action;
- enum ocfs2_unlock_action l_unlock_action;
- int l_requested;
- int l_blocking;
+ /* Data packed - enum type ocfs2_ast_action */
+ unsigned char l_action;
+ /* Data packed - enum type ocfs2_unlock_action */
+ unsigned char l_unlock_action;
unsigned int l_pending_gen;
+ spinlock_t l_lock;
+
+ struct ocfs2_dlm_lksb l_lksb;
+
wait_queue_head_t l_event;
struct list_head l_debug_list;
#ifdef CONFIG_OCFS2_FS_STATS
- unsigned long long l_lock_num_prmode; /* PR acquires */
- unsigned long long l_lock_num_exmode; /* EX acquires */
- unsigned int l_lock_num_prmode_failed; /* Failed PR gets */
- unsigned int l_lock_num_exmode_failed; /* Failed EX gets */
- unsigned long long l_lock_total_prmode; /* Tot wait for PR */
- unsigned long long l_lock_total_exmode; /* Tot wait for EX */
- unsigned int l_lock_max_prmode; /* Max wait for PR */
- unsigned int l_lock_max_exmode; /* Max wait for EX */
- unsigned int l_lock_refresh; /* Disk refreshes */
+ struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */
+ u32 l_lock_refresh; /* Disk refreshes */
+ struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */
#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map l_lockdep_map;
+#endif
+};
+
+enum ocfs2_orphan_scan_state {
+ ORPHAN_SCAN_ACTIVE,
+ ORPHAN_SCAN_INACTIVE
+};
+
+struct ocfs2_orphan_scan {
+ struct mutex os_lock;
+ struct ocfs2_super *os_osb;
+ struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
+ struct delayed_work os_orphan_scan_work;
+ struct timespec os_scantime; /* time this node ran the scan */
+ u32 os_count; /* tracks node specific scans */
+ u32 os_seqno; /* tracks cluster wide scans */
+ atomic_t os_state; /* ACTIVE or INACTIVE */
};
struct ocfs2_dlm_debug {
@@ -188,7 +256,7 @@ enum ocfs2_local_alloc_state
enum ocfs2_mount_options
{
- OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */
+ OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
@@ -196,9 +264,15 @@ enum ocfs2_mount_options
OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
- OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */
- OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
- OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
+ OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */
+ OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access
+ control lists */
+ OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
+ OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
+ OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
+ writes */
+ OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
+ OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -211,14 +285,14 @@ struct ocfs2_slot_info;
struct ocfs2_recovery_map;
struct ocfs2_replay_map;
struct ocfs2_quota_recovery;
-struct ocfs2_dentry_lock;
struct ocfs2_super
{
struct task_struct *commit_task;
struct super_block *sb;
struct inode *root_inode;
struct inode *sys_root_inode;
- struct inode *system_inodes[NUM_SYSTEM_INODES];
+ struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
+ struct inode **local_system_inodes;
struct ocfs2_slot_info *slot_info;
@@ -248,7 +322,9 @@ struct ocfs2_super
u32 s_next_generation;
unsigned long osb_flags;
s16 s_inode_steal_slot;
+ s16 s_meta_steal_slot;
atomic_t s_num_inodes_stolen;
+ atomic_t s_num_meta_stolen;
unsigned long s_mount_opt;
unsigned int s_atime_quantum;
@@ -269,7 +345,6 @@ struct ocfs2_super
struct task_struct *recovery_thread_task;
int disable_recovery;
wait_queue_head_t checkpoint_event;
- atomic_t needs_checkpoint;
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
@@ -281,6 +356,9 @@ struct ocfs2_super
*/
unsigned int local_alloc_bits;
unsigned int local_alloc_default_bits;
+ /* osb_clusters_at_boot can become stale! Do not trust it to
+ * be up to date. */
+ unsigned int osb_clusters_at_boot;
enum ocfs2_local_alloc_state local_alloc_state; /* protected
* by osb_lock */
@@ -289,16 +367,25 @@ struct ocfs2_super
u64 la_last_gd;
+ struct ocfs2_reservation_map osb_la_resmap;
+
+ unsigned int osb_resv_level;
+ unsigned int osb_dir_resv_level;
+
/* Next three fields are for local node slot recovery during
* mount. */
int dirty;
struct ocfs2_dinode *local_alloc_copy;
struct ocfs2_quota_recovery *quota_rec;
+ struct ocfs2_blockcheck_stats osb_ecc_stats;
struct ocfs2_alloc_stats alloc_stats;
char dev_str[20]; /* "major,minor" of the device */
+ u8 osb_stackflags;
+
char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+ char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];
struct ocfs2_cluster_connection *cconn;
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
@@ -325,10 +412,9 @@ struct ocfs2_super
struct list_head blocked_lock_list;
unsigned long blocked_lock_count;
- /* List of dentry locks to release. Anyone can add locks to
- * the list, ocfs2_wq processes the list */
- struct ocfs2_dentry_lock *dentry_lock_list;
- struct work_struct dentry_lock_work;
+ /* List of dquot structures to drop last reference to */
+ struct llist_head dquot_drop_list;
+ struct work_struct dquot_drop_work;
wait_queue_head_t osb_mount_event;
@@ -336,11 +422,19 @@ struct ocfs2_super
struct inode *osb_tl_inode;
struct buffer_head *osb_tl_bh;
struct delayed_work osb_truncate_log_wq;
+ atomic_t osb_tl_disable;
+ /*
+ * How many clusters in our truncate log.
+ * It must be protected by osb_tl_inode->i_mutex.
+ */
+ unsigned int truncated_clusters;
struct ocfs2_node_map osb_recovering_orphan_dirs;
unsigned int *osb_orphan_wipes;
wait_queue_head_t osb_wipe_event;
+ struct ocfs2_orphan_scan osb_orphan_scan;
+
/* used to protect metaecc calculation check of xattr. */
spinlock_t osb_xattr_lock;
@@ -349,12 +443,19 @@ struct ocfs2_super
/* the group we used to allocate inodes. */
u64 osb_inode_alloc_group;
+
+ /* rb tree root for refcount lock. */
+ struct rb_root osb_rf_lock_tree;
+ struct ocfs2_refcount_tree *osb_ref_tree_lru;
+
+ struct mutex system_file_mutex;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
/* Useful typedef for passing around journal access functions */
-typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
+typedef int (*ocfs2_journal_access_func)(handle_t *handle,
+ struct ocfs2_caching_info *ci,
struct buffer_head *bh, int type);
static inline int ocfs2_should_order_data(struct inode *inode)
@@ -414,6 +515,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
return 0;
}
+static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
+ return 1;
+ return 0;
+}
+
static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
{
if (ocfs2_supports_indexed_dirs(osb))
@@ -452,6 +560,13 @@ static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
ocfs2_set_links_count(di, links);
}
+static inline int ocfs2_refcount_tree(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
+ return 1;
+ return 0;
+}
+
/* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock
@@ -498,10 +613,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
return ret;
}
-static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
{
return (osb->s_feature_incompat &
- OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK);
+ (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
+ OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
+}
+
+static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+{
+ if (ocfs2_clusterinfo_valid(osb) &&
+ memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
+{
+ if (ocfs2_clusterinfo_valid(osb) &&
+ !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
+{
+ return ocfs2_o2cb_stack(osb) &&
+ (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
}
static inline int ocfs2_mount_local(struct ocfs2_super *osb)
@@ -538,6 +678,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
#define OCFS2_IS_VALID_DX_LEAF(ptr) \
(!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
+#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \
+ (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE))
+
static inline unsigned long ino_from_blkno(struct super_block *sb,
u64 blkno)
{
@@ -673,37 +816,73 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
}
-static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
+static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
+ unsigned int clusters)
{
- spin_lock(&osb->osb_lock);
- osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
- spin_unlock(&osb->osb_lock);
- atomic_set(&osb->s_num_inodes_stolen, 0);
+ return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
}
-static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb,
- s16 slot)
+static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
{
- spin_lock(&osb->osb_lock);
- osb->s_inode_steal_slot = slot;
- spin_unlock(&osb->osb_lock);
+ __set_bit_le(bit, bitmap);
}
+#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
-static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
+static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
{
- s16 slot;
+ __clear_bit_le(bit, bitmap);
+}
+#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
- spin_lock(&osb->osb_lock);
- slot = osb->s_inode_steal_slot;
- spin_unlock(&osb->osb_lock);
+#define ocfs2_test_bit test_bit_le
+#define ocfs2_find_next_zero_bit find_next_zero_bit_le
+#define ocfs2_find_next_bit find_next_bit_le
+
+static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr)
+{
+#if BITS_PER_LONG == 64
+ *bit += ((unsigned long) addr & 7UL) << 3;
+ addr = (void *) ((unsigned long) addr & ~7UL);
+#elif BITS_PER_LONG == 32
+ *bit += ((unsigned long) addr & 3UL) << 3;
+ addr = (void *) ((unsigned long) addr & ~3UL);
+#else
+#error "how many bits you are?!"
+#endif
+ return addr;
+}
+
+static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ ocfs2_set_bit(bit, bitmap);
+}
+
+static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ ocfs2_clear_bit(bit, bitmap);
+}
- return slot;
+static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ return ocfs2_test_bit(bit, bitmap);
+}
+
+static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max,
+ int start)
+{
+ int fix = 0, ret, tmpmax;
+ bitmap = correct_addr_and_bit_unaligned(&fix, bitmap);
+ tmpmax = max + fix;
+ start += fix;
+
+ ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
-#define ocfs2_set_bit ext2_set_bit
-#define ocfs2_clear_bit ext2_clear_bit
-#define ocfs2_test_bit ext2_test_bit
-#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
-#define ocfs2_find_next_bit ext2_find_next_bit
#endif /* OCFS2_H */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7ab6e9e5e77..938387a10d5 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -68,6 +68,7 @@
#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
+#define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1"
/* Compatibility flags */
#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -98,7 +99,10 @@
| OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
| OCFS2_FEATURE_INCOMPAT_XATTR \
| OCFS2_FEATURE_INCOMPAT_META_ECC \
- | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
+ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
+ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
+ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
+ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
| OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
| OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -160,6 +164,19 @@
/* Metadata checksum and error correction */
#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
+/* Refcount tree support */
+#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
+
+/* Discontiguous block groups */
+#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
+
+/*
+ * Incompat bit to indicate useable clusterinfo with stackflags for all
+ * cluster stacks (userspace and o2cb). If this bit is set,
+ * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
+ */
+#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
+
/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
@@ -223,74 +240,43 @@
#define OCFS2_HAS_XATTR_FL (0x0002)
#define OCFS2_INLINE_XATTR_FL (0x0004)
#define OCFS2_INDEXED_DIR_FL (0x0008)
+#define OCFS2_HAS_REFCOUNT_FL (0x0010)
/* Inode attributes, keep in sync with EXT2 */
-#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */
-#define OCFS2_UNRM_FL (0x00000002) /* Undelete */
-#define OCFS2_COMPR_FL (0x00000004) /* Compress file */
-#define OCFS2_SYNC_FL (0x00000008) /* Synchronous updates */
-#define OCFS2_IMMUTABLE_FL (0x00000010) /* Immutable file */
-#define OCFS2_APPEND_FL (0x00000020) /* writes to file may only append */
-#define OCFS2_NODUMP_FL (0x00000040) /* do not dump file */
-#define OCFS2_NOATIME_FL (0x00000080) /* do not update atime */
-#define OCFS2_DIRSYNC_FL (0x00010000) /* dirsync behaviour (directories only) */
-
-#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */
-#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */
+#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */
+#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */
+#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */
+#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */
+#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */
+#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */
+#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */
+#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */
+/* Reserved for compression usage... */
+#define OCFS2_DIRTY_FL FS_DIRTY_FL
+#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */
+#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */
+#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */
+/* End compression flags --- maybe not all used */
+#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */
+#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */
+#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */
+#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */
+#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */
+#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */
+#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/
+#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */
+
+#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */
+#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */
/*
* Extent record flags (e_node.leaf.flags)
*/
-#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
- * unwritten */
-
-/*
- * ioctl commands
- */
-#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
-#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
-#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
-#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
-
-/*
- * Space reservation / allocation / free ioctls and argument structure
- * are designed to be compatible with XFS.
- *
- * ALLOCSP* and FREESP* are not and will never be supported, but are
- * included here for completeness.
- */
-struct ocfs2_space_resv {
- __s16 l_type;
- __s16 l_whence;
- __s64 l_start;
- __s64 l_len; /* len == 0 means until end of file */
- __s32 l_sysid;
- __u32 l_pid;
- __s32 l_pad[4]; /* reserve area */
-};
-
-#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
-#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
-#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
-#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
-#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
-#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
-#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
-#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
-
-/* Used to pass group descriptor data when online resize is done */
-struct ocfs2_new_group_input {
- __u64 group; /* Group descriptor's blkno. */
- __u32 clusters; /* Total number of clusters in this group */
- __u32 frees; /* Total free clusters in this group */
- __u16 chain; /* Chain for this group */
- __u16 reserved1;
- __u32 reserved2;
-};
-
-#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
-#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
-#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
+#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
+ * unwritten */
+#define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference
+ * counted in an associated
+ * refcount tree */
/*
* Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
@@ -314,27 +300,27 @@ struct ocfs2_new_group_input {
#define OCFS2_VOL_UUID_LEN 16
#define OCFS2_MAX_VOL_LABEL_LEN 64
-/* The alternate, userspace stack fields */
+/* The cluster stack fields */
#define OCFS2_STACK_LABEL_LEN 4
#define OCFS2_CLUSTER_NAME_LEN 16
+/* Classic (historically speaking) cluster stack */
+#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb"
+
/* Journal limits (in bytes) */
#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
/*
- * Default local alloc size (in megabytes)
- *
- * The value chosen should be such that most allocations, including new
- * block groups, use local alloc.
- */
-#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
-
-/*
* Inline extended attribute size (in bytes)
* The value chosen should be aligned to 16 byte boundaries.
*/
#define OCFS2_MIN_XATTR_INLINE_SIZE 256
+/*
+ * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
+ */
+#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01)
+
struct ocfs2_system_inode_info {
char *si_name;
int si_iflags;
@@ -352,6 +338,7 @@ enum {
USER_QUOTA_SYSTEM_INODE,
GROUP_QUOTA_SYSTEM_INODE,
#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
+#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
ORPHAN_DIR_SYSTEM_INODE,
EXTENT_ALLOC_SYSTEM_INODE,
INODE_ALLOC_SYSTEM_INODE,
@@ -360,8 +347,12 @@ enum {
TRUNCATE_LOG_SYSTEM_INODE,
LOCAL_USER_QUOTA_SYSTEM_INODE,
LOCAL_GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
NUM_SYSTEM_INODES
};
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE
+#define NUM_LOCAL_SYSTEM_INODES \
+ (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
/* Global system inodes (single copy) */
@@ -390,6 +381,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
/* Parameter passed from mount.ocfs2 to module */
#define OCFS2_HB_NONE "heartbeat=none"
#define OCFS2_HB_LOCAL "heartbeat=local"
+#define OCFS2_HB_GLOBAL "heartbeat=global"
/*
* OCFS2 directory file types. Only the low 3 bits are used. The
@@ -449,7 +441,7 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
struct ocfs2_block_check {
/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */
__le16 bc_ecc; /* Single-error-correction parity vector.
- This is a simple Hamming code dependant
+ This is a simple Hamming code dependent
on the blocksize. OCFS2's maximum
blocksize, 4K, requires 16 parity bits,
so we fit in __le16. */
@@ -551,7 +543,10 @@ struct ocfs2_extent_block
block group */
__le32 h_fs_generation; /* Must match super block */
__le64 h_blkno; /* Offset on disk, in blocks */
-/*20*/ __le64 h_reserved3;
+/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this
+ eb belongs to. Only valid
+ if allocated from a
+ discontiguous block group */
__le64 h_next_leaf_blk; /* Offset on disk, in blocks,
of next leaf header pointing
to data */
@@ -593,9 +588,21 @@ struct ocfs2_slot_map_extended {
*/
};
+/*
+ * ci_stackflags is only valid if the incompat bit
+ * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
+ */
struct ocfs2_cluster_info {
/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
- __le32 ci_reserved;
+ union {
+ __le32 ci_reserved;
+ struct {
+ __u8 ci_stackflags;
+ __u8 ci_reserved1;
+ __u8 ci_reserved2;
+ __u8 ci_reserved3;
+ };
+ };
/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
/*18*/
};
@@ -632,9 +639,9 @@ struct ocfs2_super_block {
* group header */
/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
-/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
- stack. Only valid
- with INCOMPAT flag. */
+/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
+ userspace or clusterinfo
+ INCOMPAT flag set. */
/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
for this fs*/
__le16 s_reserved0;
@@ -717,7 +724,12 @@ struct ocfs2_dinode {
__le64 i_xattr_loc;
/*80*/ struct ocfs2_block_check i_check; /* Error checking */
/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
- __le64 i_reserved2[5];
+/*90*/ __le64 i_refcount_loc;
+ __le64 i_suballoc_loc; /* Suballocator block group this
+ inode belongs to. Only valid
+ if allocated from a
+ discontiguous block group */
+/*A0*/ __le64 i_reserved2[3];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
@@ -738,7 +750,7 @@ struct ocfs2_dinode {
after an unclean
shutdown */
} journal1;
- } id1; /* Inode type dependant 1 */
+ } id1; /* Inode type dependent 1 */
/*C0*/ union {
struct ocfs2_super_block i_super;
struct ocfs2_local_alloc i_lab;
@@ -852,7 +864,12 @@ struct ocfs2_dx_root_block {
__le32 dr_reserved2;
__le64 dr_free_blk; /* Pointer to head of free
* unindexed block list. */
- __le64 dr_reserved3[15];
+ __le64 dr_suballoc_loc; /* Suballocator block group
+ this root belongs to.
+ Only valid if allocated
+ from a discontiguous
+ block group */
+ __le64 dr_reserved3[14];
union {
struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
* bits for maximum space
@@ -878,6 +895,13 @@ struct ocfs2_dx_leaf {
};
/*
+ * Largest bitmap for a block (suballocator) group in bytes. This limit
+ * does not affect cluster groups (global allocator). Cluster group
+ * bitmaps run to the end of the block.
+ */
+#define OCFS2_MAX_BG_BITMAP_SIZE 256
+
+/*
* On disk allocator group structure for OCFS2
*/
struct ocfs2_group_desc
@@ -898,7 +922,87 @@ struct ocfs2_group_desc
__le64 bg_blkno; /* Offset on disk, in blocks */
/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
__le64 bg_reserved2;
-/*40*/ __u8 bg_bitmap[0];
+/*40*/ union {
+ __u8 bg_bitmap[0];
+ struct {
+ /*
+ * Block groups may be discontiguous when
+ * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
+ * The extents of a discontiguous block group are
+ * stored in bg_list. It is a flat list.
+ * l_tree_depth must always be zero. A
+ * discontiguous group is signified by a non-zero
+ * bg_list->l_next_free_rec. Only block groups
+ * can be discontiguous; Cluster groups cannot.
+ * We've never made a block group with more than
+ * 2048 blocks (256 bytes of bg_bitmap). This
+ * codifies that limit so that we can fit bg_list.
+ * bg_size of a discontiguous block group will
+ * be 256 to match bg_bitmap_filler.
+ */
+ __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
+/*140*/ struct ocfs2_extent_list bg_list;
+ };
+ };
+/* Actual on-disk size is one block */
+};
+
+struct ocfs2_refcount_rec {
+/*00*/ __le64 r_cpos; /* Physical offset, in clusters */
+ __le32 r_clusters; /* Clusters covered by this extent */
+ __le32 r_refcount; /* Reference count of this extent */
+/*10*/
+};
+#define OCFS2_32BIT_POS_MASK (0xffffffffULL)
+
+#define OCFS2_REFCOUNT_LEAF_FL (0x00000001)
+#define OCFS2_REFCOUNT_TREE_FL (0x00000002)
+
+struct ocfs2_refcount_list {
+/*00*/ __le16 rl_count; /* Maximum number of entries possible
+ in rl_records */
+ __le16 rl_used; /* Current number of used records */
+ __le32 rl_reserved2;
+ __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */
+/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */
+};
+
+
+struct ocfs2_refcount_block {
+/*00*/ __u8 rf_signature[8]; /* Signature for verification */
+ __le16 rf_suballoc_slot; /* Slot suballocator this block
+ belongs to */
+ __le16 rf_suballoc_bit; /* Bit offset in suballocator
+ block group */
+ __le32 rf_fs_generation; /* Must match superblock */
+/*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */
+ __le64 rf_parent; /* Parent block, only valid if
+ OCFS2_REFCOUNT_LEAF_FL is set in
+ rf_flags */
+/*20*/ struct ocfs2_block_check rf_check; /* Error checking */
+ __le64 rf_last_eb_blk; /* Pointer to last extent block */
+/*30*/ __le32 rf_count; /* Number of inodes sharing this
+ refcount tree */
+ __le32 rf_flags; /* See the flags above */
+ __le32 rf_clusters; /* clusters covered by refcount tree. */
+ __le32 rf_cpos; /* cluster offset in refcount tree.*/
+/*40*/ __le32 rf_generation; /* generation number. all be the same
+ * for the same refcount tree. */
+ __le32 rf_reserved0;
+ __le64 rf_suballoc_loc; /* Suballocator block group this
+ refcount block belongs to. Only
+ valid if allocated from a
+ discontiguous block group */
+/*50*/ __le64 rf_reserved1[6];
+/*80*/ union {
+ struct ocfs2_refcount_list rf_records; /* List of refcount
+ records */
+ struct ocfs2_extent_list rf_list; /* Extent record list,
+ only valid if
+ OCFS2_REFCOUNT_TREE_FL
+ is set in rf_flags */
+ };
+/* Actual on-disk size is one block */
};
/*
@@ -915,7 +1019,7 @@ struct ocfs2_xattr_entry {
__le16 xe_name_offset; /* byte offset from the 1st entry in the
local xattr storage(inode, xattr block or
xattr bucket). */
- __u8 xe_name_len; /* xattr name len, does't include prefix. */
+ __u8 xe_name_len; /* xattr name len, doesn't include prefix. */
__u8 xe_type; /* the low 7 bits indicate the name prefix
* type and the highest bit indicates whether
* the EA is stored in the local storage. */
@@ -1001,7 +1105,10 @@ struct ocfs2_xattr_block {
real xattr or a xattr tree. */
__le16 xb_reserved0;
__le32 xb_reserved1;
- __le64 xb_reserved2;
+ __le64 xb_suballoc_loc; /* Suballocator block group this
+ xattr block belongs to. Only
+ valid if allocated from a
+ discontiguous block group */
/*30*/ union {
struct ocfs2_xattr_header xb_header; /* xattr header if this
block contains xattr */
@@ -1129,7 +1236,7 @@ struct ocfs2_local_disk_dqinfo {
/* Header of one chunk of a quota file */
struct ocfs2_local_disk_chunk {
__le32 dqc_free; /* Number of free entries in the bitmap */
- u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding
+ __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding
* chunk of quota file */
};
@@ -1238,6 +1345,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
return size / sizeof(struct ocfs2_extent_rec);
}
+static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_group_desc, bg_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
+
static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
{
int size;
@@ -1268,13 +1385,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
return size;
}
-static inline int ocfs2_group_bitmap_size(struct super_block *sb)
+static inline int ocfs2_group_bitmap_size(struct super_block *sb,
+ int suballocator,
+ u32 feature_incompat)
{
- int size;
-
- size = sb->s_blocksize -
+ int size = sb->s_blocksize -
offsetof(struct ocfs2_group_desc, bg_bitmap);
+ /*
+ * The cluster allocator uses the entire block. Suballocators have
+ * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
+ * code expects bg_size set to the maximum. Thus we must keep
+ * bg_size as-is unless discontig_bg is enabled.
+ */
+ if (suballocator &&
+ (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+ size = OCFS2_MAX_BG_BITMAP_SIZE;
+
return size;
}
@@ -1312,15 +1439,48 @@ static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
return size / sizeof(struct ocfs2_extent_rec);
}
+
+static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_refcount_block, rf_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_refcount_block, rf_records.rl_recs);
+
+ return size / sizeof(struct ocfs2_refcount_rec);
+}
+
+static inline u32
+ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec)
+{
+ return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
+}
#else
static inline int ocfs2_fast_symlink_chars(int blocksize)
{
return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
}
-static inline int ocfs2_max_inline_data(int blocksize)
+static inline int ocfs2_max_inline_data_with_xattr(int blocksize,
+ struct ocfs2_dinode *di)
{
- return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+ if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL))
+ return blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
+ di->i_xattr_inline_size;
+ else
+ return blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_data.id_data);
}
static inline int ocfs2_extent_recs_per_inode(int blocksize)
@@ -1353,23 +1513,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
return size / sizeof(struct ocfs2_extent_rec);
}
-static inline int ocfs2_local_alloc_size(int blocksize)
+/*
+ * Number of struct ocfs2_extent_rec entries that fit in a group
+ * descriptor of @blocksize bytes, counting from bg_list.l_recs to the
+ * end of the block.
+ */
+static inline int ocfs2_extent_recs_per_gd(int blocksize)
{
int size;
size = blocksize -
- offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+ offsetof(struct ocfs2_group_desc, bg_list.l_recs);
- return size;
+ return size / sizeof(struct ocfs2_extent_rec);
}
-static inline int ocfs2_group_bitmap_size(int blocksize)
+/*
+ * Bytes left in an inode block of @blocksize bytes starting at
+ * id2.i_lab.la_bitmap.
+ */
+static inline int ocfs2_local_alloc_size(int blocksize)
{
int size;
size = blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+ return size;
+}
+
+/*
+ * Size in bytes used for a group descriptor's bitmap in a block of
+ * @blocksize bytes. This is the #else-branch variant that receives the
+ * block size directly: no super_block is in scope here, so the size is
+ * derived from @blocksize (the previous text read sb->s_blocksize, which
+ * names a variable this function does not have).
+ *
+ * Returns OCFS2_MAX_BG_BITMAP_SIZE when @suballocator is non-zero and
+ * @feature_incompat has OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG set,
+ * otherwise everything in the block after bg_bitmap.
+ */
+static inline int ocfs2_group_bitmap_size(int blocksize,
+					  int suballocator,
+					  uint32_t feature_incompat)
+{
+	int size = blocksize -
 		offsetof(struct ocfs2_group_desc, bg_bitmap);
+	/*
+	 * The cluster allocator uses the entire block.  Suballocators have
+	 * never used more than OCFS2_MAX_BG_BITMAP_SIZE.  Unfortunately, older
+	 * code expects bg_size set to the maximum.  Thus we must keep
+	 * bg_size as-is unless discontig_bg is enabled.
+	 */
+	if (suballocator &&
+	    (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+		size = OCFS2_MAX_BG_BITMAP_SIZE;
+
 	return size;
 }
@@ -1442,5 +1622,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
+/*
+ * Returns 1 when the group descriptor's bitmap ends exactly where bg_list
+ * begins and bg_list.l_next_free_rec is non-zero, 0 otherwise.
+ */
+static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
+{
+	const int bitmap_ends_at_list =
+		(offsetof(struct ocfs2_group_desc, bg_bitmap) +
+		 le16_to_cpu(gd->bg_size)) ==
+		offsetof(struct ocfs2_group_desc, bg_list);
+
+	/*
+	 * Only valid to check l_next_free_rec if
+	 * bg_bitmap + bg_size == bg_list; && short-circuits otherwise.
+	 */
+	return bitmap_ends_at_list && gd->bg_list.l_next_free_rec;
+}
#endif /* _OCFS2_FS_H */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
new file mode 100644
index 00000000000..5b27ff1fa57
--- /dev/null
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -0,0 +1,242 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_ioctl.h
+ *
+ * Defines OCFS2 ioctls.
+ *
+ * Copyright (C) 2010 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_IOCTL_H
+#define OCFS2_IOCTL_H
+
+/*
+ * ioctl commands
+ */
+#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+
+/*
+ * Space reservation / allocation / free ioctls and argument structure
+ * are designed to be compatible with XFS.
+ *
+ * ALLOCSP* and FREESP* are not and will never be supported, but are
+ * included here for completeness.
+ */
+/*
+ * Argument structure shared by all eight space reservation ioctls
+ * defined right after it (all _IOW, group 'X').
+ * NOTE(review): the l_* fields presumably carry the same meaning as in
+ * the XFS structure this is meant to be compatible with -- confirm there.
+ */
+struct ocfs2_space_resv {
+ __s16 l_type;
+ __s16 l_whence;
+ __s64 l_start;
+ __s64 l_len; /* len == 0 means until end of file */
+ __s32 l_sysid;
+ __u32 l_pid;
+ __s32 l_pad[4]; /* reserved area */
+};
+
+#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
+#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
+
+/* Used to pass group descriptor data when online resize is done */
+struct ocfs2_new_group_input {
+ __u64 group; /* Group descriptor's blkno. */
+ __u32 clusters; /* Total number of clusters in this group */
+ __u32 frees; /* Total free clusters in this group */
+ __u16 chain; /* Chain for this group */
+ __u16 reserved1; /* fills the gap after the 16-bit chain field */
+ __u32 reserved2; /* rounds the struct up to 24 bytes */
+};
+
+/*
+ * Online resize ioctls (group 'o'): GROUP_EXTEND takes an int, GROUP_ADD
+ * and GROUP_ADD64 take a struct ocfs2_new_group_input.
+ */
+#define OCFS2_IOC_GROUP_EXTEND _IOW('o', 1, int)
+#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
+#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
+
+/* Used to pass 2 file names to reflink. */
+/*
+ * NOTE(review): old_path and new_path are presumably user-space pointers
+ * to the two file names, carried as 64-bit integers so the layout does
+ * not depend on pointer size -- confirm against the ioctl handler.
+ */
+struct reflink_arguments {
+ __u64 old_path;
+ __u64 new_path;
+ __u64 preserve; /* NOTE(review): meaning not visible here -- confirm */
+};
+#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
+
+/* Following definitions dedicated for ocfs2_info_request ioctls. */
+#define OCFS2_INFO_MAX_REQUEST (50)
+#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2)
+
+/* Magic number of all requests */
+#define OCFS2_INFO_MAGIC (0x4F32494E)
+
+/*
+ * Always try to separate info request into small pieces to
+ * guarantee the backward&forward compatibility.
+ */
+/* Top-level argument of OCFS2_IOC_INFO: a counted array of requests. */
+struct ocfs2_info {
+ __u64 oi_requests; /* Array of __u64 pointers to requests */
+ __u32 oi_count; /* Number of requests in oi_requests */
+ __u32 oi_pad;
+};
+
+/*
+ * Common 16-byte header embedded as the first member of every
+ * ocfs2_info_* request structure. The hex markers in the left margin are
+ * byte offsets.
+ */
+struct ocfs2_info_request {
+/*00*/ __u32 ir_magic; /* Magic number */
+ __u32 ir_code; /* Info request code */
+ __u32 ir_size; /* Size of request */
+ __u32 ir_flags; /* Request flags */
+/*10*/ /* Request specific fields */
+};
+
+/*
+ * Request header followed by one __u32 value and one __u32 of padding.
+ * NOTE(review): presumably the request for OCFS2_INFO_CLUSTERSIZE.
+ */
+struct ocfs2_info_clustersize {
+ struct ocfs2_info_request ic_req;
+ __u32 ic_clustersize;
+ __u32 ic_pad;
+};
+
+/*
+ * Request header followed by one __u32 value and one __u32 of padding.
+ * NOTE(review): presumably the request for OCFS2_INFO_BLOCKSIZE.
+ */
+struct ocfs2_info_blocksize {
+ struct ocfs2_info_request ib_req;
+ __u32 ib_blocksize;
+ __u32 ib_pad;
+};
+
+/*
+ * Request header followed by one __u32 value and one __u32 of padding.
+ * NOTE(review): presumably the request for OCFS2_INFO_MAXSLOTS.
+ */
+struct ocfs2_info_maxslots {
+ struct ocfs2_info_request im_req;
+ __u32 im_max_slots;
+ __u32 im_pad;
+};
+
+/*
+ * Request header followed by a byte buffer of OCFS2_MAX_VOL_LABEL_LEN
+ * bytes. Declared packed, so no padding is added after the buffer.
+ */
+struct ocfs2_info_label {
+ struct ocfs2_info_request il_req;
+ __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN];
+} __attribute__ ((packed));
+
+/*
+ * Request header followed by a buffer of OCFS2_TEXT_UUID_LEN + 1 bytes
+ * (OCFS2_TEXT_UUID_LEN is twice OCFS2_VOL_UUID_LEN). Declared packed.
+ * NOTE(review): presumably two hex digits per UUID byte plus a
+ * terminating NUL -- confirm against the code that fills it.
+ */
+struct ocfs2_info_uuid {
+ struct ocfs2_info_request iu_req;
+ __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
+} __attribute__ ((packed));
+
+/*
+ * Request header followed by three __u32 feature words (compat,
+ * incompat, ro_compat) and one __u32 of padding.
+ */
+struct ocfs2_info_fs_features {
+ struct ocfs2_info_request if_req;
+ __u32 if_compat_features;
+ __u32 if_incompat_features;
+ __u32 if_ro_compat_features;
+ __u32 if_pad;
+};
+
+/*
+ * Request header followed by one __u64 value.
+ * NOTE(review): the unit of ij_journal_size is not stated here -- confirm.
+ */
+struct ocfs2_info_journal_size {
+ struct ocfs2_info_request ij_req;
+ __u64 ij_journal_size;
+};
+
+/*
+ * Request header followed by OCFS2_MAX_SLOTS (total, free) counter pairs
+ * and a slot number marked as an output field.
+ * NOTE(review): presumably one ifi_stat[] entry per slot, with
+ * ifi_slotnum telling how many entries are valid -- confirm.
+ */
+struct ocfs2_info_freeinode {
+ struct ocfs2_info_request ifi_req;
+ struct ocfs2_info_local_freeinode {
+ __u64 lfi_total;
+ __u64 lfi_free;
+ } ifi_stat[OCFS2_MAX_SLOTS];
+ __u32 ifi_slotnum; /* out */
+ __u32 ifi_pad;
+};
+
+#define OCFS2_INFO_MAX_HIST (32)
+
+/*
+ * Request header, an output statistics block (iff_ffs) and one input
+ * field (iff_chunksize). The statistics block holds two histograms of
+ * OCFS2_INFO_MAX_HIST buckets each, followed by summary counters.
+ */
+struct ocfs2_info_freefrag {
+ struct ocfs2_info_request iff_req;
+ struct ocfs2_info_freefrag_stats { /* (out) */
+ struct ocfs2_info_free_chunk_list {
+ __u32 fc_chunks[OCFS2_INFO_MAX_HIST];
+ __u32 fc_clusters[OCFS2_INFO_MAX_HIST];
+ } ffs_fc_hist;
+ __u32 ffs_clusters;
+ __u32 ffs_free_clusters;
+ __u32 ffs_free_chunks;
+ __u32 ffs_free_chunks_real;
+ __u32 ffs_min; /* Minimum free chunksize in clusters */
+ __u32 ffs_max;
+ __u32 ffs_avg;
+ __u32 ffs_pad;
+ } iff_ffs;
+ __u32 iff_chunksize; /* chunksize in clusters(in) */
+ __u32 iff_pad;
+};
+
+/* Codes for ocfs2_info_request */
+/*
+ * Codes start at 1, so OCFS2_INFO_NUM_TYPES evaluates to 10: one past the
+ * last valid code (OCFS2_INFO_FREEFRAG == 9), not the number of codes.
+ */
+enum ocfs2_info_type {
+ OCFS2_INFO_CLUSTERSIZE = 1,
+ OCFS2_INFO_BLOCKSIZE,
+ OCFS2_INFO_MAXSLOTS,
+ OCFS2_INFO_LABEL,
+ OCFS2_INFO_UUID,
+ OCFS2_INFO_FS_FEATURES,
+ OCFS2_INFO_JOURNAL_SIZE,
+ OCFS2_INFO_FREEINODE,
+ OCFS2_INFO_FREEFRAG,
+ OCFS2_INFO_NUM_TYPES
+};
+
+/* Flags for struct ocfs2_info_request */
+/* Filled by the caller */
+#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not
+ required. This is a hint.
+ It is up to ocfs2 whether
+ the request can be fulfilled
+ without locking. */
+/* Filled by ocfs2 */
+#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood
+ this request and
+ filled in the answer */
+
+#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during
+ request handling. */
+
+#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
+
+/* Argument of OCFS2_IOC_MOVE_EXT; input fields first, then outputs. */
+struct ocfs2_move_extents {
+/* All values are in bytes unless noted otherwise */
+ /* in */
+ __u64 me_start; /* Virtual start in the file to move */
+ __u64 me_len; /* Length of the extents to be moved */
+ __u64 me_goal; /* Physical offset of the goal,
+ given in blocks */
+ __u64 me_threshold; /* Maximum distance from goal or threshold
+ for auto defragmentation */
+ __u64 me_flags; /* Flags for the operation:
+ * - auto defragmentation.
+ * - refcount,xattr cases.
+ */
+ /* out */
+ __u64 me_moved_len; /* Moved/defragmented length */
+ __u64 me_new_offset; /* Resulting physical location */
+ __u32 me_reserved[2]; /* Reserved for future use */
+};
+
+#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to
+ claim new clusters
+ as the goal place
+ for extents moving */
+#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent
+ moving; this makes the
+ move less likely to
+ fail but may leave the fs
+ even more fragmented */
+#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmentation
+ completely gets done.
+ */
+
+#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents)
+
+#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index a53ce87481b..d277aabf5df 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -48,6 +48,8 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_FLOCK,
OCFS2_LOCK_TYPE_QINFO,
OCFS2_LOCK_TYPE_NFS_SYNC,
+ OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+ OCFS2_LOCK_TYPE_REFCOUNT,
OCFS2_NUM_LOCK_TYPES
};
@@ -85,6 +87,12 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_NFS_SYNC:
c = 'Y';
break;
+ case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
+ c = 'P';
+ break;
+ case OCFS2_LOCK_TYPE_REFCOUNT:
+ c = 'T';
+ break;
default:
c = '\0';
}
@@ -104,6 +112,9 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_OPEN] = "Open",
[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
[OCFS2_LOCK_TYPE_QINFO] = "Quota",
+ [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
+ [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
+ [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 82d5eeac0ff..2e45c8d2ea7 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -23,6 +23,8 @@
/*
* The protocol version for ocfs2 cluster locking. See dlmglue.c for
* more details.
+ *
+ * 1.0 - Initial locking version from ocfs2 1.4.
*/
#define OCFS2_LOCKING_PROTOCOL_MAJOR 1
#define OCFS2_LOCKING_PROTOCOL_MINOR 0
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
new file mode 100644
index 00000000000..6cb019b7c6a
--- /dev/null
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -0,0 +1,2768 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ocfs2
+
+#if !defined(_TRACE_OCFS2_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OCFS2_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Event class carrying a single int, printed as "%d".
+ * DEFINE_OCFS2_INT_EVENT(name) declares a named event of this class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__int,
+ TP_PROTO(int num),
+ TP_ARGS(num),
+ TP_STRUCT__entry(
+ __field(int, num)
+ ),
+ TP_fast_assign(
+ __entry->num = num;
+ ),
+ TP_printk("%d", __entry->num)
+);
+
+#define DEFINE_OCFS2_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__int, name, \
+ TP_PROTO(int num), \
+ TP_ARGS(num))
+
+/*
+ * Event class carrying a single unsigned int, printed as "%u".
+ * DEFINE_OCFS2_UINT_EVENT(name) declares a named event of this class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__uint,
+ TP_PROTO(unsigned int num),
+ TP_ARGS(num),
+ TP_STRUCT__entry(
+ __field( unsigned int, num )
+ ),
+ TP_fast_assign(
+ __entry->num = num;
+ ),
+ TP_printk("%u", __entry->num)
+);
+
+#define DEFINE_OCFS2_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__uint, name, \
+ TP_PROTO(unsigned int num), \
+ TP_ARGS(num))
+
+/*
+ * Event class carrying a single unsigned long long, printed as "%llu".
+ * The recorded field is named blkno; the prototype in
+ * DEFINE_OCFS2_ULL_EVENT() calls the same argument num.
+ */
+DECLARE_EVENT_CLASS(ocfs2__ull,
+ TP_PROTO(unsigned long long blkno),
+ TP_ARGS(blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%llu", __entry->blkno)
+);
+
+#define DEFINE_OCFS2_ULL_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull, name, \
+ TP_PROTO(unsigned long long num), \
+ TP_ARGS(num))
+
+/*
+ * Event class carrying a single pointer value, printed with "%p".
+ * DEFINE_OCFS2_POINTER_EVENT(name) declares a named event of this class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__pointer,
+ TP_PROTO(void *pointer),
+ TP_ARGS(pointer),
+ TP_STRUCT__entry(
+ __field(void *, pointer)
+ ),
+ TP_fast_assign(
+ __entry->pointer = pointer;
+ ),
+ TP_printk("%p", __entry->pointer)
+);
+
+#define DEFINE_OCFS2_POINTER_EVENT(name) \
+DEFINE_EVENT(ocfs2__pointer, name, \
+ TP_PROTO(void *pointer), \
+ TP_ARGS(pointer))
+
+/*
+ * Event class carrying one string, stored with __string()/__assign_str()
+ * and printed with "%s".
+ * DEFINE_OCFS2_STRING_EVENT(name) declares a named event of this class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__string,
+ TP_PROTO(const char *name),
+ TP_ARGS(name),
+ TP_STRUCT__entry(
+ __string(name,name)
+ ),
+ TP_fast_assign(
+ __assign_str(name, name);
+ ),
+ TP_printk("%s", __get_str(name))
+);
+
+#define DEFINE_OCFS2_STRING_EVENT(name) \
+DEFINE_EVENT(ocfs2__string, name, \
+ TP_PROTO(const char *name), \
+ TP_ARGS(name))
+
+/*
+ * Event class carrying (int, int), printed as "%d %d".
+ * DEFINE_OCFS2_INT_INT_EVENT(name) declares a named event of this class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__int_int,
+ TP_PROTO(int value1, int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(int, value1)
+ __field(int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%d %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_INT_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__int_int, name, \
+ TP_PROTO(int val1, int val2), \
+ TP_ARGS(val1, val2))
+
+/*
+ * Event class carrying (unsigned int, int), printed as "%u %d".
+ * DEFINE_OCFS2_UINT_INT_EVENT(name) declares a named event of this class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__uint_int,
+ TP_PROTO(unsigned int value1, int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned int, value1)
+ __field(int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%u %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_UINT_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__uint_int, name, \
+ TP_PROTO(unsigned int val1, int val2), \
+ TP_ARGS(val1, val2))
+
+/*
+ * Event class carrying (unsigned int, unsigned int), printed as "%u %u".
+ * DEFINE_OCFS2_UINT_UINT_EVENT(name) declares a named event of this class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__uint_uint,
+ TP_PROTO(unsigned int value1, unsigned int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned int, value1)
+ __field(unsigned int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%u %u", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__uint_uint, name, \
+ TP_PROTO(unsigned int val1, unsigned int val2), \
+ TP_ARGS(val1, val2))
+
+/*
+ * Event class carrying (unsigned long long, unsigned int), printed as
+ * "%llu %u".
+ * DEFINE_OCFS2_ULL_UINT_EVENT(name) declares a named event of this class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__ull_uint,
+ TP_PROTO(unsigned long long value1, unsigned int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%llu %u", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_uint, name, \
+ TP_PROTO(unsigned long long val1, unsigned int val2), \
+ TP_ARGS(val1, val2))
+
+/*
+ * Event class carrying (unsigned long long, int), printed as "%llu %d".
+ * DEFINE_OCFS2_ULL_INT_EVENT(name) declares a named event of this class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__ull_int,
+ TP_PROTO(unsigned long long value1, int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%llu %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_int, name, \
+ TP_PROTO(unsigned long long val1, int val2), \
+ TP_ARGS(val1, val2))
+
+/*
+ * Event class carrying two unsigned long long values, printed as
+ * "%llu %llu".
+ * DEFINE_OCFS2_ULL_ULL_EVENT(name) declares a named event of this class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__ull_ull,
+ TP_PROTO(unsigned long long value1, unsigned long long value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned long long, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%llu %llu", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_ull, name, \
+ TP_PROTO(unsigned long long val1, unsigned long long val2), \
+ TP_ARGS(val1, val2))
+
+/*
+ * Event class carrying (unsigned long long, unsigned long long,
+ * unsigned int), printed as "%llu %llu %u".
+ * DEFINE_OCFS2_ULL_ULL_UINT_EVENT(name) declares a named event of this
+ * class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint,
+ TP_PROTO(unsigned long long value1,
+ unsigned long long value2, unsigned int value3),
+ TP_ARGS(value1, value2, value3),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned long long, value2)
+ __field(unsigned int, value3)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %llu %u",
+ __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_ull_uint, name, \
+ TP_PROTO(unsigned long long val1, \
+ unsigned long long val2, unsigned int val3), \
+ TP_ARGS(val1, val2, val3))
+
+/*
+ * Event class carrying (unsigned long long, unsigned int, unsigned int),
+ * printed as "%llu %u %u".
+ * DEFINE_OCFS2_ULL_UINT_UINT_EVENT(name) declares a named event of this
+ * class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint,
+ TP_PROTO(unsigned long long value1,
+ unsigned int value2, unsigned int value3),
+ TP_ARGS(value1, value2, value3),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned int, value2)
+ __field(unsigned int, value3)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %u %u", __entry->value1,
+ __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_uint_uint, name, \
+ TP_PROTO(unsigned long long val1, \
+ unsigned int val2, unsigned int val3), \
+ TP_ARGS(val1, val2, val3))
+
+/*
+ * Event class carrying three unsigned int values, printed as "%u %u %u".
+ * DEFINE_OCFS2_UINT_UINT_UINT_EVENT(name) declares a named event of this
+ * class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__uint_uint_uint,
+ TP_PROTO(unsigned int value1, unsigned int value2,
+ unsigned int value3),
+ TP_ARGS(value1, value2, value3),
+ TP_STRUCT__entry(
+ __field( unsigned int, value1 )
+ __field( unsigned int, value2 )
+ __field( unsigned int, value3 )
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%u %u %u", __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_UINT_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__uint_uint_uint, name, \
+ TP_PROTO(unsigned int value1, unsigned int value2, \
+ unsigned int value3), \
+ TP_ARGS(value1, value2, value3))
+
+/*
+ * Event class carrying three unsigned long long values, printed as
+ * "%llu %llu %llu".
+ * DEFINE_OCFS2_ULL_ULL_ULL_EVENT(name) declares a named event of this
+ * class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_ull,
+ TP_PROTO(unsigned long long value1,
+ unsigned long long value2, unsigned long long value3),
+ TP_ARGS(value1, value2, value3),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned long long, value2)
+ __field(unsigned long long, value3)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %llu %llu",
+ __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_ULL_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_ull_ull, name, \
+ TP_PROTO(unsigned long long value1, unsigned long long value2, \
+ unsigned long long value3), \
+ TP_ARGS(value1, value2, value3))
+
+/*
+ * Event class carrying one unsigned long long followed by three ints,
+ * printed as "%llu %d %d %d".
+ * DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(name) declares a named event of this
+ * class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__ull_int_int_int,
+ TP_PROTO(unsigned long long ull, int value1, int value2, int value3),
+ TP_ARGS(ull, value1, value2, value3),
+ TP_STRUCT__entry(
+ __field( unsigned long long, ull )
+ __field( int, value1 )
+ __field( int, value2 )
+ __field( int, value3 )
+ ),
+ TP_fast_assign(
+ __entry->ull = ull;
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %d %d %d",
+ __entry->ull, __entry->value1,
+ __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_int_int_int, name, \
+ TP_PROTO(unsigned long long ull, int value1, \
+ int value2, int value3), \
+ TP_ARGS(ull, value1, value2, value3))
+
+/*
+ * Event class carrying one unsigned long long followed by three unsigned
+ * ints, printed as "%llu %u %u %u".
+ * DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(name) declares a named event of
+ * this class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint_uint,
+ TP_PROTO(unsigned long long ull, unsigned int value1,
+ unsigned int value2, unsigned int value3),
+ TP_ARGS(ull, value1, value2, value3),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ull)
+ __field(unsigned int, value1)
+ __field(unsigned int, value2)
+ __field(unsigned int, value3)
+ ),
+ TP_fast_assign(
+ __entry->ull = ull;
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %u %u %u",
+ __entry->ull, __entry->value1,
+ __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_uint_uint_uint, name, \
+ TP_PROTO(unsigned long long ull, unsigned int value1, \
+ unsigned int value2, unsigned int value3), \
+ TP_ARGS(ull, value1, value2, value3))
+
+/*
+ * Event class carrying two unsigned long long values followed by two
+ * unsigned ints, printed as "%llu %llu %u %u". The recorded fields are
+ * value1..value4; the prototype in the DEFINE macro names the same four
+ * arguments ull, ull1, value2, value3.
+ */
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint_uint,
+ TP_PROTO(unsigned long long value1, unsigned long long value2,
+ unsigned int value3, unsigned int value4),
+ TP_ARGS(value1, value2, value3, value4),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned long long, value2)
+ __field(unsigned int, value3)
+ __field(unsigned int, value4)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ __entry->value4 = value4;
+ ),
+ TP_printk("%llu %llu %u %u",
+ __entry->value1, __entry->value2,
+ __entry->value3, __entry->value4)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_ull_uint_uint, name, \
+ TP_PROTO(unsigned long long ull, unsigned long long ull1, \
+ unsigned int value2, unsigned int value3), \
+ TP_ARGS(ull, ull1, value2, value3))
+
+/* Trace events for fs/ocfs2/alloc.c. */
+/*
+ * Event class carrying (owner, value1, value2) as (unsigned long long,
+ * unsigned int, unsigned int), printed as "%llu %u %u".
+ * DEFINE_OCFS2_BTREE_EVENT(name) declares a named event of this class.
+ * The backslash at the end of the first TP_PROTO line is only a line
+ * continuation and has no other effect.
+ */
+DECLARE_EVENT_CLASS(ocfs2__btree_ops,
+ TP_PROTO(unsigned long long owner,\
+ unsigned int value1, unsigned int value2),
+ TP_ARGS(owner, value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(unsigned int, value1)
+ __field(unsigned int, value2)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%llu %u %u",
+ __entry->owner, __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_BTREE_EVENT(name) \
+DEFINE_EVENT(ocfs2__btree_ops, name, \
+ TP_PROTO(unsigned long long owner, \
+ unsigned int value1, unsigned int value2), \
+ TP_ARGS(owner, value1, value2))
+
+/*
+ * The five BTREE events take (owner, value1, value2) through
+ * ocfs2__btree_ops; the two INT events take a single int.
+ */
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_adjust_rightmost_branch);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_rotate_tree_right);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_append_rec_to_path);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_insert_extent_start);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_add_clusters_in_btree);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_num_free_extents);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_complete_edge_insert);
+
+/* Records owner and depth, printed as "%llu %d". */
+TRACE_EVENT(ocfs2_grow_tree,
+ TP_PROTO(unsigned long long owner, int depth),
+ TP_ARGS(owner, depth),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(int, depth)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->depth = depth;
+ ),
+ TP_printk("%llu %d", __entry->owner, __entry->depth)
+);
+
+/* Records subtree_root, blkno and depth, printed as "%d %llu %d". */
+TRACE_EVENT(ocfs2_rotate_subtree,
+ TP_PROTO(int subtree_root, unsigned long long blkno,
+ int depth),
+ TP_ARGS(subtree_root, blkno, depth),
+ TP_STRUCT__entry(
+ __field(int, subtree_root)
+ __field(unsigned long long, blkno)
+ __field(int, depth)
+ ),
+ TP_fast_assign(
+ __entry->subtree_root = subtree_root;
+ __entry->blkno = blkno;
+ __entry->depth = depth;
+ ),
+ TP_printk("%d %llu %d", __entry->subtree_root,
+ __entry->blkno, __entry->depth)
+);
+
+/*
+ * Records five insert parameters (ins_appending, ins_contig,
+ * ins_contig_index, free_records, ins_tree_depth), printed as
+ * "%u %u %d %d %d" in that order.
+ */
+TRACE_EVENT(ocfs2_insert_extent,
+ TP_PROTO(unsigned int ins_appending, unsigned int ins_contig,
+ int ins_contig_index, int free_records, int ins_tree_depth),
+ TP_ARGS(ins_appending, ins_contig, ins_contig_index, free_records,
+ ins_tree_depth),
+ TP_STRUCT__entry(
+ __field(unsigned int, ins_appending)
+ __field(unsigned int, ins_contig)
+ __field(int, ins_contig_index)
+ __field(int, free_records)
+ __field(int, ins_tree_depth)
+ ),
+ TP_fast_assign(
+ __entry->ins_appending = ins_appending;
+ __entry->ins_contig = ins_contig;
+ __entry->ins_contig_index = ins_contig_index;
+ __entry->free_records = free_records;
+ __entry->ins_tree_depth = ins_tree_depth;
+ ),
+ TP_printk("%u %u %d %d %d",
+ __entry->ins_appending, __entry->ins_contig,
+ __entry->ins_contig_index, __entry->free_records,
+ __entry->ins_tree_depth)
+);
+
+/*
+ * Records split_index, c_contig_type, c_has_empty_extent and
+ * c_split_covers_rec, printed as "%d %u %u %u" in that order.
+ */
+TRACE_EVENT(ocfs2_split_extent,
+ TP_PROTO(int split_index, unsigned int c_contig_type,
+ unsigned int c_has_empty_extent,
+ unsigned int c_split_covers_rec),
+ TP_ARGS(split_index, c_contig_type,
+ c_has_empty_extent, c_split_covers_rec),
+ TP_STRUCT__entry(
+ __field(int, split_index)
+ __field(unsigned int, c_contig_type)
+ __field(unsigned int, c_has_empty_extent)
+ __field(unsigned int, c_split_covers_rec)
+ ),
+ TP_fast_assign(
+ __entry->split_index = split_index;
+ __entry->c_contig_type = c_contig_type;
+ __entry->c_has_empty_extent = c_has_empty_extent;
+ __entry->c_split_covers_rec = c_split_covers_rec;
+ ),
+ TP_printk("%d %u %u %u", __entry->split_index, __entry->c_contig_type,
+ __entry->c_has_empty_extent, __entry->c_split_covers_rec)
+);
+
+/*
+ * Records owner, cpos, len, index, e_cpos and clusters, printed as
+ * "%llu %u %u %d %u %u" in that order.
+ */
+TRACE_EVENT(ocfs2_remove_extent,
+ TP_PROTO(unsigned long long owner, unsigned int cpos,
+ unsigned int len, int index,
+ unsigned int e_cpos, unsigned int clusters),
+ TP_ARGS(owner, cpos, len, index, e_cpos, clusters),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(unsigned int, cpos)
+ __field(unsigned int, len)
+ __field(int, index)
+ __field(unsigned int, e_cpos)
+ __field(unsigned int, clusters)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->cpos = cpos;
+ __entry->len = len;
+ __entry->index = index;
+ __entry->e_cpos = e_cpos;
+ __entry->clusters = clusters;
+ ),
+ TP_printk("%llu %u %u %d %u %u",
+ __entry->owner, __entry->cpos, __entry->len, __entry->index,
+ __entry->e_cpos, __entry->clusters)
+);
+
+/*
+ * Records ino, new_cpos, clusters and depth, printed as "%llu %u %u %u"
+ * in that order.
+ */
+TRACE_EVENT(ocfs2_commit_truncate,
+ TP_PROTO(unsigned long long ino, unsigned int new_cpos,
+ unsigned int clusters, unsigned int depth),
+ TP_ARGS(ino, new_cpos, clusters, depth),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, new_cpos)
+ __field(unsigned int, clusters)
+ __field(unsigned int, depth)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->new_cpos = new_cpos;
+ __entry->clusters = clusters;
+ __entry->depth = depth;
+ ),
+ TP_printk("%llu %u %u %u",
+ __entry->ino, __entry->new_cpos,
+ __entry->clusters, __entry->depth)
+);
+
+/*
+ * Records a single block number.
+ * NOTE(review): the payload is identical to the ocfs2__ull event class;
+ * the only visible difference is the trailing space in this format
+ * string ("%llu "). Confirm whether that is intentional.
+ */
+TRACE_EVENT(ocfs2_validate_extent_block,
+ TP_PROTO(unsigned long long blkno),
+ TP_ARGS(blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%llu ", __entry->blkno)
+);
+
+/*
+ * Records insert_cpos, insert_index, has_empty, next_free and l_count,
+ * printed as "%u %d %d %d %u" in that order.
+ */
+TRACE_EVENT(ocfs2_rotate_leaf,
+ TP_PROTO(unsigned int insert_cpos, int insert_index,
+ int has_empty, int next_free,
+ unsigned int l_count),
+ TP_ARGS(insert_cpos, insert_index, has_empty,
+ next_free, l_count),
+ TP_STRUCT__entry(
+ __field(unsigned int, insert_cpos)
+ __field(int, insert_index)
+ __field(int, has_empty)
+ __field(int, next_free)
+ __field(unsigned int, l_count)
+ ),
+ TP_fast_assign(
+ __entry->insert_cpos = insert_cpos;
+ __entry->insert_index = insert_index;
+ __entry->has_empty = has_empty;
+ __entry->next_free = next_free;
+ __entry->l_count = l_count;
+ ),
+ TP_printk("%u %d %d %d %u", __entry->insert_cpos,
+ __entry->insert_index, __entry->has_empty,
+ __entry->next_free, __entry->l_count)
+);
+
+/* Records status, reason and err as three ints, printed as "%d %d %d". */
+TRACE_EVENT(ocfs2_add_clusters_in_btree_ret,
+ TP_PROTO(int status, int reason, int err),
+ TP_ARGS(status, reason, err),
+ TP_STRUCT__entry(
+ __field(int, status)
+ __field(int, reason)
+ __field(int, err)
+ ),
+ TP_fast_assign(
+ __entry->status = status;
+ __entry->reason = reason;
+ __entry->err = err;
+ ),
+ TP_printk("%d %d %d", __entry->status,
+ __entry->reason, __entry->err)
+);
+
+/*
+ * Records owner, cpos, len and phys, printed as "%llu %u %u %u" in that
+ * order.
+ */
+TRACE_EVENT(ocfs2_mark_extent_written,
+ TP_PROTO(unsigned long long owner, unsigned int cpos,
+ unsigned int len, unsigned int phys),
+ TP_ARGS(owner, cpos, len, phys),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(unsigned int, cpos)
+ __field(unsigned int, len)
+ __field(unsigned int, phys)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->cpos = cpos;
+ __entry->len = len;
+ __entry->phys = phys;
+ ),
+ TP_printk("%llu %u %u %u",
+ __entry->owner, __entry->cpos,
+ __entry->len, __entry->phys)
+);
+
+/*
+ * Event class carrying (blkno, index, start, num), printed as
+ * "%llu %d %u %u".
+ * DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(name) declares a named event of this
+ * class.
+ */
+DECLARE_EVENT_CLASS(ocfs2__truncate_log_ops,
+ TP_PROTO(unsigned long long blkno, int index,
+ unsigned int start, unsigned int num),
+ TP_ARGS(blkno, index, start, num),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ __field(int, index)
+ __field(unsigned int, start)
+ __field(unsigned int, num)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ __entry->index = index;
+ __entry->start = start;
+ __entry->num = num;
+ ),
+ TP_printk("%llu %d %u %u",
+ __entry->blkno, __entry->index,
+ __entry->start, __entry->num)
+);
+
+#define DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(name) \
+DEFINE_EVENT(ocfs2__truncate_log_ops, name, \
+ TP_PROTO(unsigned long long blkno, int index, \
+ unsigned int start, unsigned int num), \
+ TP_ARGS(blkno, index, start, num))
+
+/*
+ * The first two events take (blkno, index, start, num) through
+ * ocfs2__truncate_log_ops. The rest use the generic classes; their macro
+ * names list the argument types in order (ULL = unsigned long long,
+ * UINT = unsigned int, INT = int).
+ */
+DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_truncate_log_append);
+
+DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_replay_truncate_records);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_flush_truncate_log);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_begin_truncate_log_recovery);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_truncate_log_recovery_num);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_complete_truncate_log_recovery);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_free_cached_blocks);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_cache_cluster_dealloc);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_run_deallocs);
+
+/*
+ * Records type, slot, suballoc, blkno and bit, printed as
+ * "%d %d %llu %llu %u" in that order.
+ */
+TRACE_EVENT(ocfs2_cache_block_dealloc,
+ TP_PROTO(int type, int slot, unsigned long long suballoc,
+ unsigned long long blkno, unsigned int bit),
+ TP_ARGS(type, slot, suballoc, blkno, bit),
+ TP_STRUCT__entry(
+ __field(int, type)
+ __field(int, slot)
+ __field(unsigned long long, suballoc)
+ __field(unsigned long long, blkno)
+ __field(unsigned int, bit)
+ ),
+ TP_fast_assign(
+ __entry->type = type;
+ __entry->slot = slot;
+ __entry->suballoc = suballoc;
+ __entry->blkno = blkno;
+ __entry->bit = bit;
+ ),
+ TP_printk("%d %d %llu %llu %u",
+ __entry->type, __entry->slot, __entry->suballoc,
+ __entry->blkno, __entry->bit)
+);
+
+/*
+ * Records the device major/minor numbers taken from sb->s_dev together
+ * with blk and count; the super_block pointer itself is not stored.
+ * Printed as "%d %d %llu %llu". Note that count is declared as
+ * unsigned long long in the prototype but stored in a __u64 field.
+ */
+TRACE_EVENT(ocfs2_trim_extent,
+ TP_PROTO(struct super_block *sb, unsigned long long blk,
+ unsigned long long count),
+ TP_ARGS(sb, blk, count),
+ TP_STRUCT__entry(
+ __field(int, dev_major)
+ __field(int, dev_minor)
+ __field(unsigned long long, blk)
+ __field(__u64, count)
+ ),
+ TP_fast_assign(
+ __entry->dev_major = MAJOR(sb->s_dev);
+ __entry->dev_minor = MINOR(sb->s_dev);
+ __entry->blk = blk;
+ __entry->count = count;
+ ),
+ TP_printk("%d %d %llu %llu",
+ __entry->dev_major, __entry->dev_minor,
+ __entry->blk, __entry->count)
+);
+
+/*
+ * ocfs2_trim_group takes one unsigned long long and three unsigned ints;
+ * ocfs2_trim_fs takes three unsigned long longs.
+ */
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
+
+/* End of trace events for fs/ocfs2/alloc.c. */
+
+/* Trace events for fs/ocfs2/localalloc.c. */
+
+/*
+ * Instances of the generic event classes; each macro name lists the
+ * argument types in order (ULL = unsigned long long, UINT = unsigned int,
+ * INT = int).
+ */
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_la_set_sizes);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_alloc_should_use_local);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_load_local_alloc);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_begin_local_alloc_recovery);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_reserve_local_alloc_bits);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_local_alloc_count_bits);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits_search_bitmap);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_sync_local_to_main);
+
+/*
+ * Records count, bit, start_blk and blkno, printed as "%d %d %llu %llu"
+ * in that order.
+ */
+TRACE_EVENT(ocfs2_sync_local_to_main_free,
+ TP_PROTO(int count, int bit, unsigned long long start_blk,
+ unsigned long long blkno),
+ TP_ARGS(count, bit, start_blk, blkno),
+ TP_STRUCT__entry(
+ __field(int, count)
+ __field(int, bit)
+ __field(unsigned long long, start_blk)
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->count = count;
+ __entry->bit = bit;
+ __entry->start_blk = start_blk;
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%d %d %llu %llu",
+ __entry->count, __entry->bit, __entry->start_blk,
+ __entry->blkno)
+);
+
+/*
+ * ocfs2_local_alloc_new_window takes two ints; the _result event takes
+ * (unsigned long long, unsigned int).
+ */
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_new_window);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result);
+
+/* End of trace events for fs/ocfs2/localalloc.c. */
+
+/* Trace events for fs/ocfs2/resize.c. */
+
+/*
+ * Instances of the generic event classes; each macro name lists the
+ * argument types in order (ULL = unsigned long long, UINT = unsigned int).
+ */
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_update_last_group_and_inode);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_group_extend);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_group_add);
+
+/* End of trace events for fs/ocfs2/resize.c. */
+
+/* Trace events for fs/ocfs2/suballoc.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_group_descriptor);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_contig);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_discontig);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_block_group_alloc);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_nospc);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_no_new_group);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_reserve_new_inode_new_group);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_set_bits);
+
+TRACE_EVENT(ocfs2_relink_block_group, /* suballoc.c event: inode block, chain index, group block, previous group block */
+ TP_PROTO(unsigned long long i_blkno, unsigned int chain,
+ unsigned long long bg_blkno,
+ unsigned long long prev_blkno),
+ TP_ARGS(i_blkno, chain, bg_blkno, prev_blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, i_blkno)
+ __field(unsigned int, chain)
+ __field(unsigned long long, bg_blkno)
+ __field(unsigned long long, prev_blkno)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->i_blkno = i_blkno;
+ __entry->chain = chain;
+ __entry->bg_blkno = bg_blkno;
+ __entry->prev_blkno = prev_blkno;
+ ),
+ TP_printk("%llu %u %llu %llu", /* output order: i_blkno chain bg_blkno prev_blkno */
+ __entry->i_blkno, __entry->chain, __entry->bg_blkno,
+ __entry->prev_blkno)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_cluster_group_search_wrong_max_bits);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cluster_group_search_max_block);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_block_group_search_max_block);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_search_chain_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_succ);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_end);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_claim_suballoc_bits);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_claim_new_inode_at_loc);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_clear_bits);
+
+TRACE_EVENT(ocfs2_free_suballoc_bits, /* suballoc.c event: inode, group, start bit and bit count */
+ TP_PROTO(unsigned long long inode, unsigned long long group,
+ unsigned int start_bit, unsigned int count),
+ TP_ARGS(inode, group, start_bit, count),
+ TP_STRUCT__entry(
+ __field(unsigned long long, inode) /* a number here, not a pointer (stored as unsigned long long) */
+ __field(unsigned long long, group)
+ __field(unsigned int, start_bit)
+ __field(unsigned int, count)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->inode = inode;
+ __entry->group = group;
+ __entry->start_bit = start_bit;
+ __entry->count = count;
+ ),
+ TP_printk("%llu %llu %u %u", __entry->inode, __entry->group, /* output order matches the field order */
+ __entry->start_bit, __entry->count)
+);
+
+TRACE_EVENT(ocfs2_free_clusters, /* suballoc.c event: group block, start block, start bit and count */
+ TP_PROTO(unsigned long long bg_blkno, unsigned long long start_blk,
+ unsigned int start_bit, unsigned int count),
+ TP_ARGS(bg_blkno, start_blk, start_bit, count),
+ TP_STRUCT__entry(
+ __field(unsigned long long, bg_blkno)
+ __field(unsigned long long, start_blk)
+ __field(unsigned int, start_bit)
+ __field(unsigned int, count)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->bg_blkno = bg_blkno;
+ __entry->start_blk = start_blk;
+ __entry->start_bit = start_bit;
+ __entry->count = count;
+ ),
+ TP_printk("%llu %llu %u %u", __entry->bg_blkno, __entry->start_blk, /* output order matches the field order */
+ __entry->start_bit, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_get_suballoc_slot_bit);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_test_suballoc_bit);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_test_inode_bit);
+
+/* End of trace events for fs/ocfs2/suballoc.c. */
+
+/* Trace events for fs/ocfs2/refcounttree.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_refcount_block);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_refcount_trees);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree_blkno);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_change_refcount_rec);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_expand_inline_ref_root);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_divide_leaf_refcount_block);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_new_leaf_refcount_block);
+
+DECLARE_EVENT_CLASS(ocfs2__refcount_tree_ops, /* refcounttree.c event class: block, record index, cpos, clusters, refcount */
+ TP_PROTO(unsigned long long blkno, int index,
+ unsigned long long cpos,
+ unsigned int clusters, unsigned int refcount),
+ TP_ARGS(blkno, index, cpos, clusters, refcount),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ __field(int, index)
+ __field(unsigned long long, cpos)
+ __field(unsigned int, clusters)
+ __field(unsigned int, refcount)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->blkno = blkno;
+ __entry->index = index;
+ __entry->cpos = cpos;
+ __entry->clusters = clusters;
+ __entry->refcount = refcount;
+ ),
+ TP_printk("%llu %d %llu %u %u", __entry->blkno, __entry->index, /* output order matches the field order */
+ __entry->cpos, __entry->clusters, __entry->refcount)
+);
+
+#define DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(name) /* define event 'name' in class ocfs2__refcount_tree_ops */ \
+DEFINE_EVENT(ocfs2__refcount_tree_ops, name, \
+ TP_PROTO(unsigned long long blkno, int index, \
+ unsigned long long cpos, \
+ unsigned int count, unsigned int refcount), /* 'count' sits in the position the class names 'clusters' */ \
+ TP_ARGS(blkno, index, cpos, count, refcount))
+
+DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_insert_refcount_rec);
+
+TRACE_EVENT(ocfs2_split_refcount_rec, /* refcounttree.c event: a (cpos, clusters, refcount) triple and its split_* counterpart */
+ TP_PROTO(unsigned long long cpos,
+ unsigned int clusters, unsigned int refcount,
+ unsigned long long split_cpos,
+ unsigned int split_clusters, unsigned int split_refcount),
+ TP_ARGS(cpos, clusters, refcount,
+ split_cpos, split_clusters, split_refcount),
+ TP_STRUCT__entry(
+ __field(unsigned long long, cpos)
+ __field(unsigned int, clusters)
+ __field(unsigned int, refcount)
+ __field(unsigned long long, split_cpos)
+ __field(unsigned int, split_clusters)
+ __field(unsigned int, split_refcount)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->cpos = cpos;
+ __entry->clusters = clusters;
+ __entry->refcount = refcount;
+ __entry->split_cpos = split_cpos;
+ __entry->split_clusters = split_clusters;
+ __entry->split_refcount = split_refcount;
+ ),
+ TP_printk("%llu %u %u %llu %u %u", /* first triple, then the split_* triple */
+ __entry->cpos, __entry->clusters, __entry->refcount,
+ __entry->split_cpos, __entry->split_clusters,
+ __entry->split_refcount)
+);
+
+DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_split_refcount_rec_insert);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_increase_refcount_begin);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_change);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_increase_refcount_insert);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_split);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_remove_refcount_extent);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_restore_refcount_block);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_decrease_refcount_rec);
+
+TRACE_EVENT(ocfs2_decrease_refcount, /* refcounttree.c event: owner, cpos, len and the 'delete' int */
+ TP_PROTO(unsigned long long owner,
+ unsigned long long cpos,
+ unsigned int len, int delete),
+ TP_ARGS(owner, cpos, len, delete),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(unsigned long long, cpos)
+ __field(unsigned int, len)
+ __field(int, delete)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->owner = owner;
+ __entry->cpos = cpos;
+ __entry->len = len;
+ __entry->delete = delete;
+ ),
+ TP_printk("%llu %llu %u %d", /* output order: owner cpos len delete */
+ __entry->owner, __entry->cpos, __entry->len, __entry->delete)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_mark_extent_refcounted);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_calc_refcount_meta_credits);
+
+TRACE_EVENT(ocfs2_calc_refcount_meta_credits_iterate, /* refcounttree.c event with seven numeric fields */
+ TP_PROTO(int recs_add, unsigned long long cpos,
+ unsigned int clusters, unsigned long long r_cpos,
+ unsigned int r_clusters, unsigned int refcount, int index),
+ TP_ARGS(recs_add, cpos, clusters, r_cpos, r_clusters, refcount, index),
+ TP_STRUCT__entry(
+ __field(int, recs_add)
+ __field(unsigned long long, cpos)
+ __field(unsigned int, clusters)
+ __field(unsigned long long, r_cpos)
+ __field(unsigned int, r_clusters)
+ __field(unsigned int, refcount)
+ __field(int, index)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->recs_add = recs_add;
+ __entry->cpos = cpos;
+ __entry->clusters = clusters;
+ __entry->r_cpos = r_cpos;
+ __entry->r_clusters = r_clusters;
+ __entry->refcount = refcount;
+ __entry->index = index;
+ ),
+ TP_printk("%d %llu %u %llu %u %u %d", /* output order matches the field order */
+ __entry->recs_add, __entry->cpos, __entry->clusters,
+ __entry->r_cpos, __entry->r_clusters,
+ __entry->refcount, __entry->index)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_add_refcount_flag);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_prepare_refcount_change_for_del);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_lock_refcount_allocators);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_page);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_jbd);
+
+TRACE_EVENT(ocfs2_clear_ext_refcount, /* refcounttree.c event: inode number, cpos, len, p_cluster, ext_flags */
+ TP_PROTO(unsigned long long ino, unsigned int cpos,
+ unsigned int len, unsigned int p_cluster,
+ unsigned int ext_flags),
+ TP_ARGS(ino, cpos, len, p_cluster, ext_flags),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, cpos)
+ __field(unsigned int, len)
+ __field(unsigned int, p_cluster)
+ __field(unsigned int, ext_flags)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->ino = ino;
+ __entry->cpos = cpos;
+ __entry->len = len;
+ __entry->p_cluster = p_cluster;
+ __entry->ext_flags = ext_flags;
+ ),
+ TP_printk("%llu %u %u %u %u", /* all values decimal, ext_flags included */
+ __entry->ino, __entry->cpos, __entry->len,
+ __entry->p_cluster, __entry->ext_flags)
+);
+
+TRACE_EVENT(ocfs2_replace_clusters, /* refcounttree.c event: inode number, cpos, old, new, len, ext_flags */
+ TP_PROTO(unsigned long long ino, unsigned int cpos,
+ unsigned int old, unsigned int new, unsigned int len,
+ unsigned int ext_flags),
+ TP_ARGS(ino, cpos, old, new, len, ext_flags),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, cpos)
+ __field(unsigned int, old)
+ __field(unsigned int, new)
+ __field(unsigned int, len)
+ __field(unsigned int, ext_flags)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->ino = ino;
+ __entry->cpos = cpos;
+ __entry->old = old;
+ __entry->new = new;
+ __entry->len = len;
+ __entry->ext_flags = ext_flags;
+ ),
+ TP_printk("%llu %u %u %u %u %u", /* output order: ino cpos old new len ext_flags */
+ __entry->ino, __entry->cpos, __entry->old, __entry->new,
+ __entry->len, __entry->ext_flags)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_make_clusters_writable);
+
+TRACE_EVENT(ocfs2_refcount_cow_hunk, /* refcounttree.c event: inode number, cpos, write_len, max_cpos, cow_start, cow_len */
+ TP_PROTO(unsigned long long ino, unsigned int cpos,
+ unsigned int write_len, unsigned int max_cpos,
+ unsigned int cow_start, unsigned int cow_len),
+ TP_ARGS(ino, cpos, write_len, max_cpos, cow_start, cow_len),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, cpos)
+ __field(unsigned int, write_len)
+ __field(unsigned int, max_cpos)
+ __field(unsigned int, cow_start)
+ __field(unsigned int, cow_len)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->ino = ino;
+ __entry->cpos = cpos;
+ __entry->write_len = write_len;
+ __entry->max_cpos = max_cpos;
+ __entry->cow_start = cow_start;
+ __entry->cow_len = cow_len;
+ ),
+ TP_printk("%llu %u %u %u %u %u", /* output order matches the field order */
+ __entry->ino, __entry->cpos, __entry->write_len,
+ __entry->max_cpos, __entry->cow_start, __entry->cow_len)
+);
+
+/* End of trace events for fs/ocfs2/refcounttree.c. */
+
+/* Trace events for fs/ocfs2/aops.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__get_block, /* aops.c event class: inode number, iblock, bh_result pointer, create int */
+ TP_PROTO(unsigned long long ino, unsigned long long iblock,
+ void *bh_result, int create),
+ TP_ARGS(ino, iblock, bh_result, create),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned long long, iblock)
+ __field(void *, bh_result) /* only the pointer value is kept; it is never dereferenced here */
+ __field(int, create)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->iblock = iblock;
+ __entry->bh_result = bh_result;
+ __entry->create = create;
+ ),
+ TP_printk("%llu %llu %p %d", /* output order: ino iblock bh_result create */
+ __entry->ino, __entry->iblock,
+ __entry->bh_result, __entry->create)
+);
+
+#define DEFINE_OCFS2_GET_BLOCK_EVENT(name) /* define event 'name' in class ocfs2__get_block */ \
+DEFINE_EVENT(ocfs2__get_block, name, \
+ TP_PROTO(unsigned long long ino, unsigned long long iblock, \
+ void *bh_result, int create), /* same parameter list as the class prototype */ \
+ TP_ARGS(ino, iblock, bh_result, create))
+
+DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_symlink_get_block);
+
+DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_get_block);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap);
+
+TRACE_EVENT(ocfs2_try_to_write_inline_data, /* aops.c event: inode number, len, pos, flags */
+ TP_PROTO(unsigned long long ino, unsigned int len,
+ unsigned long long pos, unsigned int flags),
+ TP_ARGS(ino, len, pos, flags),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, len)
+ __field(unsigned long long, pos)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->ino = ino;
+ __entry->len = len;
+ __entry->pos = pos;
+ __entry->flags = flags;
+ ),
+ TP_printk("%llu %u %llu 0x%x", /* flags is the only value printed in hex */
+ __entry->ino, __entry->len, __entry->pos, __entry->flags)
+);
+
+TRACE_EVENT(ocfs2_write_begin_nolock, /* aops.c event with nine fields describing a write request */
+ TP_PROTO(unsigned long long ino,
+ long long i_size, unsigned int i_clusters,
+ unsigned long long pos, unsigned int len,
+ unsigned int flags, void *page,
+ unsigned int clusters, unsigned int extents_to_split),
+ TP_ARGS(ino, i_size, i_clusters, pos, len, flags,
+ page, clusters, extents_to_split),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(long long, i_size) /* the only signed 64-bit field; printed with %lld */
+ __field(unsigned int, i_clusters)
+ __field(unsigned long long, pos)
+ __field(unsigned int, len)
+ __field(unsigned int, flags)
+ __field(void *, page) /* only the pointer value is kept */
+ __field(unsigned int, clusters)
+ __field(unsigned int, extents_to_split)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->ino = ino;
+ __entry->i_size = i_size;
+ __entry->i_clusters = i_clusters;
+ __entry->pos = pos;
+ __entry->len = len;
+ __entry->flags = flags;
+ __entry->page = page;
+ __entry->clusters = clusters;
+ __entry->extents_to_split = extents_to_split;
+ ),
+ TP_printk("%llu %lld %u %llu %u %u %p %u %u", /* output order matches the field order; flags in decimal */
+ __entry->ino, __entry->i_size, __entry->i_clusters,
+ __entry->pos, __entry->len,
+ __entry->flags, __entry->page, __entry->clusters,
+ __entry->extents_to_split)
+);
+
+TRACE_EVENT(ocfs2_write_end_inline, /* aops.c event: inode number, pos, copied, id_count, features */
+ TP_PROTO(unsigned long long ino,
+ unsigned long long pos, unsigned int copied,
+ unsigned int id_count, unsigned int features),
+ TP_ARGS(ino, pos, copied, id_count, features),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned long long, pos)
+ __field(unsigned int, copied)
+ __field(unsigned int, id_count)
+ __field(unsigned int, features)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->ino = ino;
+ __entry->pos = pos;
+ __entry->copied = copied;
+ __entry->id_count = id_count;
+ __entry->features = features;
+ ),
+ TP_printk("%llu %llu %u %u %u", /* all values decimal, features included */
+ __entry->ino, __entry->pos, __entry->copied,
+ __entry->id_count, __entry->features)
+);
+
+/* End of trace events for fs/ocfs2/aops.c. */
+
+/* Trace events for fs/ocfs2/mmap.c. */
+
+TRACE_EVENT(ocfs2_fault, /* mmap.c event: inode number, area pointer, page pointer, pgoff */
+ TP_PROTO(unsigned long long ino,
+ void *area, void *page, unsigned long pgoff),
+ TP_ARGS(ino, area, page, pgoff),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(void *, area) /* pointer values only; neither pointer is dereferenced here */
+ __field(void *, page)
+ __field(unsigned long, pgoff)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->area = area;
+ __entry->page = page;
+ __entry->pgoff = pgoff;
+ ),
+ TP_printk("%llu %p %p %lu", /* output order: ino area page pgoff */
+ __entry->ino, __entry->area, __entry->page, __entry->pgoff)
+);
+
+/* End of trace events for fs/ocfs2/mmap.c. */
+
+/* Trace events for fs/ocfs2/file.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__file_ops, /* file.c event class: inode/file/dentry pointers, inode number, dentry name, one extra value */
+ TP_PROTO(void *inode, void *file, void *dentry,
+ unsigned long long ino,
+ unsigned int d_len, const unsigned char *d_name,
+ unsigned long long para), /* event-specific value; its meaning is not visible in this header */
+ TP_ARGS(inode, file, dentry, ino, d_len, d_name, para),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(void *, file)
+ __field(void *, dentry)
+ __field(unsigned long long, ino)
+ __field(unsigned int, d_len)
+ __string(d_name, d_name) /* string field, filled by __assign_str() below */
+ __field(unsigned long long, para)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->file = file;
+ __entry->dentry = dentry;
+ __entry->ino = ino;
+ __entry->d_len = d_len;
+ __assign_str(d_name, d_name);
+ __entry->para = para;
+ ),
+ TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file, /* para is printed before the name */
+ __entry->dentry, __entry->ino, __entry->para,
+ __entry->d_len, __get_str(d_name)) /* d_len is the %.*s precision, so at most d_len chars of the name print */
+);
+
+#define DEFINE_OCFS2_FILE_OPS(name) /* define event 'name' in class ocfs2__file_ops */ \
+DEFINE_EVENT(ocfs2__file_ops, name, \
+TP_PROTO(void *inode, void *file, void *dentry, \
+ unsigned long long ino, \
+ unsigned int d_len, const unsigned char *d_name, \
+ unsigned long long mode), /* 'mode' sits in the position the class names 'para' */ \
+ TP_ARGS(inode, file, dentry, ino, d_len, d_name, mode))
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_open);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_release);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error);
+
+TRACE_EVENT(ocfs2_extend_allocation, /* file.c event: inode block, size, clusters, clusters_to_add, why, restart_func */
+ TP_PROTO(unsigned long long ip_blkno, unsigned long long size,
+ unsigned int clusters, unsigned int clusters_to_add,
+ int why, int restart_func),
+ TP_ARGS(ip_blkno, size, clusters, clusters_to_add, why, restart_func),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ip_blkno)
+ __field(unsigned long long, size)
+ __field(unsigned int, clusters)
+ __field(unsigned int, clusters_to_add)
+ __field(int, why)
+ __field(int, restart_func)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->ip_blkno = ip_blkno;
+ __entry->size = size;
+ __entry->clusters = clusters;
+ __entry->clusters_to_add = clusters_to_add;
+ __entry->why = why;
+ __entry->restart_func = restart_func;
+ ),
+ TP_printk("%llu %llu %u %u %d %d", /* output order matches the field order */
+ __entry->ip_blkno, __entry->size, __entry->clusters,
+ __entry->clusters_to_add, __entry->why, __entry->restart_func)
+);
+
+TRACE_EVENT(ocfs2_extend_allocation_end, /* file.c event: inode number plus di_* and ip_/i_ cluster and size values */
+ TP_PROTO(unsigned long long ino,
+ unsigned int di_clusters, unsigned long long di_size,
+ unsigned int ip_clusters, unsigned long long i_size),
+ TP_ARGS(ino, di_clusters, di_size, ip_clusters, i_size),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, di_clusters)
+ __field(unsigned long long, di_size)
+ __field(unsigned int, ip_clusters)
+ __field(unsigned long long, i_size)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->ino = ino;
+ __entry->di_clusters = di_clusters;
+ __entry->di_size = di_size;
+ __entry->ip_clusters = ip_clusters;
+ __entry->i_size = i_size;
+ ),
+ TP_printk("%llu %u %llu %u %llu", __entry->ino, __entry->di_clusters, /* output order matches the field order */
+ __entry->di_size, __entry->ip_clusters, __entry->i_size)
+);
+
+TRACE_EVENT(ocfs2_write_zero_page, /* file.c event: inode number, abs_from/abs_to, index, zero_from/zero_to */
+ TP_PROTO(unsigned long long ino,
+ unsigned long long abs_from, unsigned long long abs_to,
+ unsigned long index, unsigned int zero_from,
+ unsigned int zero_to),
+ TP_ARGS(ino, abs_from, abs_to, index, zero_from, zero_to),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned long long, abs_from)
+ __field(unsigned long long, abs_to)
+ __field(unsigned long, index)
+ __field(unsigned int, zero_from)
+ __field(unsigned int, zero_to)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->ino = ino;
+ __entry->abs_from = abs_from;
+ __entry->abs_to = abs_to;
+ __entry->index = index;
+ __entry->zero_from = zero_from;
+ __entry->zero_to = zero_to;
+ ),
+ TP_printk("%llu %llu %llu %lu %u %u", __entry->ino, /* output order matches the field order */
+ __entry->abs_from, __entry->abs_to,
+ __entry->index, __entry->zero_from, __entry->zero_to)
+);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend_range);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend);
+
+TRACE_EVENT(ocfs2_setattr, /* file.c event: inode/dentry pointers, inode number, dentry name and four ia_* values */
+ TP_PROTO(void *inode, void *dentry,
+ unsigned long long ino,
+ unsigned int d_len, const unsigned char *d_name,
+ unsigned int ia_valid, unsigned int ia_mode,
+ unsigned int ia_uid, unsigned int ia_gid),
+ TP_ARGS(inode, dentry, ino, d_len, d_name,
+ ia_valid, ia_mode, ia_uid, ia_gid),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(void *, dentry)
+ __field(unsigned long long, ino)
+ __field(unsigned int, d_len)
+ __string(d_name, d_name) /* string field, filled by __assign_str() below */
+ __field(unsigned int, ia_valid)
+ __field(unsigned int, ia_mode)
+ __field(unsigned int, ia_uid)
+ __field(unsigned int, ia_gid)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->dentry = dentry;
+ __entry->ino = ino;
+ __entry->d_len = d_len;
+ __assign_str(d_name, d_name);
+ __entry->ia_valid = ia_valid;
+ __entry->ia_mode = ia_mode;
+ __entry->ia_uid = ia_uid;
+ __entry->ia_gid = ia_gid;
+ ),
+ TP_printk("%p %p %llu %.*s %u %u %u %u", __entry->inode, /* ia_valid and ia_mode print in decimal */
+ __entry->dentry, __entry->ino, __entry->d_len, /* d_len is the %.*s precision for the name */
+ __get_str(d_name), __entry->ia_valid, __entry->ia_mode,
+ __entry->ia_uid, __entry->ia_gid)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_write_remove_suid);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_partial_clusters);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range1);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range2);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
+
+TRACE_EVENT(ocfs2_prepare_inode_for_write, /* file.c event: inode number, saved_pos, appending, count and two optional int flags */
+ TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
+ int appending, unsigned long count,
+ int *direct_io, int *has_refcount), /* both pointers may be NULL (handled in TP_fast_assign) */
+ TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned long long, saved_pos)
+ __field(int, appending)
+ __field(unsigned long, count)
+ __field(int, direct_io) /* stores the pointed-to int, not the pointer */
+ __field(int, has_refcount)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->saved_pos = saved_pos;
+ __entry->appending = appending;
+ __entry->count = count;
+ __entry->direct_io = direct_io ? *direct_io : -1; /* -1 records a NULL pointer */
+ __entry->has_refcount = has_refcount ? *has_refcount : -1; /* -1 records a NULL pointer */
+ ),
+ TP_printk("%llu %llu %d %lu %d %d", __entry->ino, /* output order matches the field order */
+ __entry->saved_pos, __entry->appending, __entry->count,
+ __entry->direct_io, __entry->has_refcount)
+);
+
+DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
+
+/* End of trace events for fs/ocfs2/file.c. */
+
+/* Trace events for fs/ocfs2/inode.c. */
+
+TRACE_EVENT(ocfs2_iget_begin, /* inode.c event: inode number, flags, sysfile_type */
+ TP_PROTO(unsigned long long ino, unsigned int flags, int sysfile_type),
+ TP_ARGS(ino, flags, sysfile_type),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, flags)
+ __field(int, sysfile_type)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->ino = ino;
+ __entry->flags = flags;
+ __entry->sysfile_type = sysfile_type;
+ ),
+ TP_printk("%llu %u %d", __entry->ino, /* flags prints in decimal */
+ __entry->flags, __entry->sysfile_type)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_iget5_locked);
+
+TRACE_EVENT(ocfs2_iget_end, /* inode.c event: inode pointer and inode number */
+ TP_PROTO(void *inode, unsigned long long ino),
+ TP_ARGS(inode, ino),
+ TP_STRUCT__entry(
+ __field(void *, inode) /* pointer value only; never dereferenced here */
+ __field(unsigned long long, ino)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->ino = ino;
+ ),
+ TP_printk("%p %llu", __entry->inode, __entry->ino) /* output order: inode ino */
+);
+
+TRACE_EVENT(ocfs2_find_actor, /* inode.c event: inode pointer, inode number, args pointer, fi_blkno */
+ TP_PROTO(void *inode, unsigned long long ino,
+ void *args, unsigned long long fi_blkno),
+ TP_ARGS(inode, ino, args, fi_blkno),
+ TP_STRUCT__entry(
+ __field(void *, inode) /* pointer values only; neither pointer is dereferenced here */
+ __field(unsigned long long, ino)
+ __field(void *, args)
+ __field(unsigned long long, fi_blkno)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->ino = ino;
+ __entry->args = args;
+ __entry->fi_blkno = fi_blkno;
+ ),
+ TP_printk("%p %llu %p %llu", __entry->inode, __entry->ino, /* output order matches the field order */
+ __entry->args, __entry->fi_blkno)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_populate_inode);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
+
+TRACE_EVENT(ocfs2_inode_is_valid_to_delete, /* inode.c event: two task pointers, inode number, flags */
+ TP_PROTO(void *task, void *dc_task, unsigned long long ino,
+ unsigned int flags),
+ TP_ARGS(task, dc_task, ino, flags),
+ TP_STRUCT__entry(
+ __field(void *, task) /* pointer values only; neither pointer is dereferenced here */
+ __field(void *, dc_task)
+ __field(unsigned long long, ino)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->task = task;
+ __entry->dc_task = dc_task;
+ __entry->ino = ino;
+ __entry->flags = flags;
+ ),
+ TP_printk("%p %p %llu %u", __entry->task, __entry->dc_task, /* flags prints in decimal */
+ __entry->ino, __entry->flags)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_query_inode_wipe_begin);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_query_inode_wipe_succ);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_query_inode_wipe_end);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_cleanup_delete_inode);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode);
+
+TRACE_EVENT(ocfs2_inode_revalidate, /* inode.c event: inode pointer, inode number, flags */
+ TP_PROTO(void *inode, unsigned long long ino,
+ unsigned int flags),
+ TP_ARGS(inode, ino, flags),
+ TP_STRUCT__entry(
+ __field(void *, inode) /* pointer value only; never dereferenced here */
+ __field(unsigned long long, ino)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->ino = ino;
+ __entry->flags = flags;
+ ),
+ TP_printk("%p %llu %u", __entry->inode, __entry->ino, __entry->flags) /* flags prints in decimal */
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_mark_inode_dirty);
+
+/* End of trace events for fs/ocfs2/inode.c. */
+
+/* Trace events for fs/ocfs2/extent_map.c. */
+
+TRACE_EVENT(ocfs2_read_virt_blocks, /* extent_map.c event: inode pointer, vblock, nr, bhs pointer, flags, validate pointer */
+ TP_PROTO(void *inode, unsigned long long vblock, int nr,
+ void *bhs, unsigned int flags, void *validate),
+ TP_ARGS(inode, vblock, nr, bhs, flags, validate),
+ TP_STRUCT__entry(
+ __field(void *, inode) /* all three pointers are stored as values, never dereferenced here */
+ __field(unsigned long long, vblock)
+ __field(int, nr)
+ __field(void *, bhs)
+ __field(unsigned int, flags)
+ __field(void *, validate)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->vblock = vblock;
+ __entry->nr = nr;
+ __entry->bhs = bhs;
+ __entry->flags = flags;
+ __entry->validate = validate;
+ ),
+ TP_printk("%p %llu %d %p %x %p", __entry->inode, __entry->vblock, /* flags prints in hex with no 0x prefix */
+ __entry->nr, __entry->bhs, __entry->flags, __entry->validate)
+);
+
+/* End of trace events for fs/ocfs2/extent_map.c. */
+
+/* Trace events for fs/ocfs2/slot_map.c. */
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_refresh_slot_info);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers_block);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_find_slot);
+
+/* End of trace events for fs/ocfs2/slot_map.c. */
+
+/* Trace events for fs/ocfs2/heartbeat.c. */
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down);
+
+/* End of trace events for fs/ocfs2/heartbeat.c. */
+
+/* Trace events for fs/ocfs2/super.c. */
+
+TRACE_EVENT(ocfs2_remount, /* super.c event: s_flags, osb_flags and the int flags argument */
+ TP_PROTO(unsigned long s_flags, unsigned long osb_flags, int flags),
+ TP_ARGS(s_flags, osb_flags, flags),
+ TP_STRUCT__entry(
+ __field(unsigned long, s_flags)
+ __field(unsigned long, osb_flags)
+ __field(int, flags)
+ ),
+ TP_fast_assign( /* each argument is stored unchanged */
+ __entry->s_flags = s_flags;
+ __entry->osb_flags = osb_flags;
+ __entry->flags = flags;
+ ),
+ TP_printk("%lu %lu %d", __entry->s_flags, /* all three flag words print in decimal */
+ __entry->osb_flags, __entry->flags)
+);
+
+TRACE_EVENT(ocfs2_fill_super, /* super.c event: sb pointer, data pointer, silent int */
+ TP_PROTO(void *sb, void *data, int silent),
+ TP_ARGS(sb, data, silent),
+ TP_STRUCT__entry(
+ __field(void *, sb) /* pointer values only; neither pointer is dereferenced here */
+ __field(void *, data)
+ __field(int, silent)
+ ),
+ TP_fast_assign(
+ __entry->sb = sb;
+ __entry->data = data;
+ __entry->silent = silent;
+ ),
+ TP_printk("%p %p %d", __entry->sb, /* output order: sb data silent */
+ __entry->data, __entry->silent)
+);
+
+TRACE_EVENT(ocfs2_parse_options, /* super.c event: is_remount int and the options string */
+ TP_PROTO(int is_remount, char *options),
+ TP_ARGS(is_remount, options),
+ TP_STRUCT__entry(
+ __field(int, is_remount)
+ __string(options, options) /* string field, filled by __assign_str() below */
+ ),
+ TP_fast_assign(
+ __entry->is_remount = is_remount;
+ __assign_str(options, options); /* NOTE(review): no NULL guard here - confirm callers never pass NULL */
+ ),
+ TP_printk("%d %s", __entry->is_remount, __get_str(options)) /* output order: is_remount options */
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super);
+
+TRACE_EVENT(ocfs2_statfs, /* super.c event: sb pointer and buf pointer */
+ TP_PROTO(void *sb, void *buf),
+ TP_ARGS(sb, buf),
+ TP_STRUCT__entry(
+ __field(void *, sb) /* pointer values only; neither pointer is dereferenced here */
+ __field(void *, buf)
+ ),
+ TP_fast_assign(
+ __entry->sb = sb;
+ __entry->buf = buf;
+ ),
+ TP_printk("%p %p", __entry->sb, __entry->buf) /* output order: sb buf */
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_dismount_volume);
+
+TRACE_EVENT(ocfs2_initialize_super, /* super.c event: label and uuid strings, root_dir, system_dir, cluster_bits */
+ TP_PROTO(char *label, char *uuid_str, unsigned long long root_dir,
+ unsigned long long system_dir, int cluster_bits),
+ TP_ARGS(label, uuid_str, root_dir, system_dir, cluster_bits),
+ TP_STRUCT__entry(
+ __string(label, label) /* both string fields are filled by __assign_str() below */
+ __string(uuid_str, uuid_str)
+ __field(unsigned long long, root_dir)
+ __field(unsigned long long, system_dir)
+ __field(int, cluster_bits)
+ ),
+ TP_fast_assign(
+ __assign_str(label, label);
+ __assign_str(uuid_str, uuid_str);
+ __entry->root_dir = root_dir;
+ __entry->system_dir = system_dir;
+ __entry->cluster_bits = cluster_bits;
+ ),
+ TP_printk("%s %s %llu %llu %d", __get_str(label), __get_str(uuid_str), /* output order matches the field order */
+ __entry->root_dir, __entry->system_dir, __entry->cluster_bits)
+);
+
+/* End of trace events for fs/ocfs2/super.c. */
+
+/* Trace events for fs/ocfs2/xattr.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_xattr_block);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_xattr_extend_allocation);
+
+TRACE_EVENT(ocfs2_init_xattr_set_ctxt, /* xattr.c event: name string plus meta, clusters and credits ints */
+ TP_PROTO(const char *name, int meta, int clusters, int credits),
+ TP_ARGS(name, meta, clusters, credits),
+ TP_STRUCT__entry(
+ __string(name, name) /* string field, filled by __assign_str() below */
+ __field(int, meta)
+ __field(int, clusters)
+ __field(int, credits)
+ ),
+ TP_fast_assign(
+ __assign_str(name, name);
+ __entry->meta = meta;
+ __entry->clusters = clusters;
+ __entry->credits = credits;
+ ),
+ TP_printk("%s %d %d %d", __get_str(name), __entry->meta, /* output order: name meta clusters credits */
+ __entry->clusters, __entry->credits)
+);
+
+DECLARE_EVENT_CLASS(ocfs2__xattr_find, /* xattr.c event class: inode number, name, name_index, hash, location, xe_index */
+ TP_PROTO(unsigned long long ino, const char *name, int name_index,
+ unsigned int hash, unsigned long long location,
+ int xe_index),
+ TP_ARGS(ino, name, name_index, hash, location, xe_index),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __string(name, name) /* string field, filled by __assign_str() below */
+ __field(int, name_index)
+ __field(unsigned int, hash)
+ __field(unsigned long long, location)
+ __field(int, xe_index)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __assign_str(name, name);
+ __entry->name_index = name_index;
+ __entry->hash = hash;
+ __entry->location = location;
+ __entry->xe_index = xe_index;
+ ),
+ TP_printk("%llu %s %d %u %llu %d", __entry->ino, __get_str(name), /* hash prints in decimal */
+ __entry->name_index, __entry->hash, __entry->location,
+ __entry->xe_index)
+);
+
+/* Define event 'name' of class ocfs2__xattr_find; no argument may itself be spelled 'name'. */
+#define DEFINE_OCFS2_XATTR_FIND_EVENT(name) \
+DEFINE_EVENT(ocfs2__xattr_find, name, \
+ TP_PROTO(unsigned long long ino, const char *xattr_name, int name_index, \
+ unsigned int hash, unsigned long long bucket, int xe_index), \
+ TP_ARGS(ino, xattr_name, name_index, hash, bucket, xe_index))
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_bucket_find);
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find);
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find_rec);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_iterate_xattr_buckets);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_iterate_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cp_xattr_block_to_bucket_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cp_xattr_block_to_bucket_end);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block_begin);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_defrag_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_bucket_cross_cluster);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_divide_xattr_bucket_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_divide_xattr_bucket_move);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_cp_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_buckets);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_adjust_xattr_cross_cluster);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_add_new_xattr_cluster);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_insert);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_extend_xattr_bucket);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_add_new_xattr_bucket);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_xattr_bucket_value_truncate);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_rm_xattr_cluster);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_header);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_create_empty_xattr_block);
+
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_bucket);
+
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_index_block);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_xattr_bucket_value_refcount);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_reflink_xattr_buckets);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_rec);
+
+/* End of trace events for fs/ocfs2/xattr.c. */
+
+/* Trace events for fs/ocfs2/reservations.c. */
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_insert);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_begin);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_end);
+
+TRACE_EVENT(ocfs2_resv_find_window_begin,
+ TP_PROTO(unsigned int r_start, unsigned int r_end, unsigned int goal,
+ unsigned int wanted, int empty_root),
+ TP_ARGS(r_start, r_end, goal, wanted, empty_root),
+ TP_STRUCT__entry(
+ __field(unsigned int, r_start)
+ __field(unsigned int, r_end)
+ __field(unsigned int, goal)
+ __field(unsigned int, wanted)
+ __field(int, empty_root)
+ ),
+ TP_fast_assign(
+ __entry->r_start = r_start;
+ __entry->r_end = r_end;
+ __entry->goal = goal;
+ __entry->wanted = wanted;
+ __entry->empty_root = empty_root;
+ ),
+ TP_printk("%u %u %u %u %d", __entry->r_start, __entry->r_end,
+ __entry->goal, __entry->wanted, __entry->empty_root)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_find_window_prev);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_resv_find_window_next);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cannibalize_resv_begin);
+
+TRACE_EVENT(ocfs2_cannibalize_resv_end,
+ TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
+ unsigned int last_start, unsigned int last_len),
+ TP_ARGS(start, end, len, last_start, last_len),
+ TP_STRUCT__entry(
+ __field(unsigned int, start)
+ __field(unsigned int, end)
+ __field(unsigned int, len)
+ __field(unsigned int, last_start)
+ __field(unsigned int, last_len)
+ ),
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ __entry->len = len;
+ __entry->last_start = last_start;
+ __entry->last_len = last_len;
+ ),
+ TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
+ __entry->len, __entry->last_start, __entry->last_len)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_resv_bits);
+
+TRACE_EVENT(ocfs2_resmap_claimed_bits_begin,
+ TP_PROTO(unsigned int cstart, unsigned int cend, unsigned int clen,
+ unsigned int r_start, unsigned int r_end, unsigned int r_len,
+ unsigned int last_start, unsigned int last_len),
+ TP_ARGS(cstart, cend, clen, r_start, r_end,
+ r_len, last_start, last_len),
+ TP_STRUCT__entry(
+ __field(unsigned int, cstart)
+ __field(unsigned int, cend)
+ __field(unsigned int, clen)
+ __field(unsigned int, r_start)
+ __field(unsigned int, r_end)
+ __field(unsigned int, r_len)
+ __field(unsigned int, last_start)
+ __field(unsigned int, last_len)
+ ),
+ TP_fast_assign(
+ __entry->cstart = cstart;
+ __entry->cend = cend;
+ __entry->clen = clen;
+ __entry->r_start = r_start;
+ __entry->r_end = r_end;
+ __entry->r_len = r_len;
+ __entry->last_start = last_start;
+ __entry->last_len = last_len;
+ ),
+ TP_printk("%u %u %u %u %u %u %u %u",
+ __entry->cstart, __entry->cend, __entry->clen,
+ __entry->r_start, __entry->r_end, __entry->r_len,
+ __entry->last_start, __entry->last_len)
+);
+
+TRACE_EVENT(ocfs2_resmap_claimed_bits_end,
+ TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
+ unsigned int last_start, unsigned int last_len),
+ TP_ARGS(start, end, len, last_start, last_len),
+ TP_STRUCT__entry(
+ __field(unsigned int, start)
+ __field(unsigned int, end)
+ __field(unsigned int, len)
+ __field(unsigned int, last_start)
+ __field(unsigned int, last_len)
+ ),
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ __entry->len = len;
+ __entry->last_start = last_start;
+ __entry->last_len = last_len;
+ ),
+ TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
+ __entry->len, __entry->last_start, __entry->last_len)
+);
+
+/* End of trace events for fs/ocfs2/reservations.c. */
+
+/* Trace events for fs/ocfs2/quota_local.c. */
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_recover_local_quota_file);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_finish_quota_recovery);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(olq_set_dquot);
+
+/* End of trace events for fs/ocfs2/quota_local.c. */
+
+/* Trace events for fs/ocfs2/quota_global.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_quota_block);
+
+TRACE_EVENT(ocfs2_sync_dquot,
+ TP_PROTO(unsigned int dq_id, long long dqb_curspace,
+ long long spacechange, long long curinodes,
+ long long inodechange),
+ TP_ARGS(dq_id, dqb_curspace, spacechange, curinodes, inodechange),
+ TP_STRUCT__entry(
+ __field(unsigned int, dq_id)
+ __field(long long, dqb_curspace)
+ __field(long long, spacechange)
+ __field(long long, curinodes)
+ __field(long long, inodechange)
+ ),
+ TP_fast_assign(
+ __entry->dq_id = dq_id;
+ __entry->dqb_curspace = dqb_curspace;
+ __entry->spacechange = spacechange;
+ __entry->curinodes = curinodes;
+ __entry->inodechange = inodechange;
+ ),
+ TP_printk("%u %lld %lld %lld %lld", __entry->dq_id,
+ __entry->dqb_curspace, __entry->spacechange,
+ __entry->curinodes, __entry->inodechange)
+);
+
+TRACE_EVENT(ocfs2_sync_dquot_helper,
+ TP_PROTO(unsigned int dq_id, unsigned int dq_type, unsigned long type,
+ const char *s_id),
+ TP_ARGS(dq_id, dq_type, type, s_id),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, dq_id)
+ __field(unsigned int, dq_type)
+ __field(unsigned long, type)
+ __string(s_id, s_id)
+ ),
+ TP_fast_assign(
+ __entry->dq_id = dq_id;
+ __entry->dq_type = dq_type;
+ __entry->type = type;
+ __assign_str(s_id, s_id);
+ ),
+ TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type,
+ __entry->type, __get_str(s_id))
+);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_write_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
+
+/* End of trace events for fs/ocfs2/quota_global.c. */
+
+/* Trace events for fs/ocfs2/dir.c. */
+DEFINE_OCFS2_INT_EVENT(ocfs2_search_dirblock);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_dir_block);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_find_entry_el);
+
+TRACE_EVENT(ocfs2_dx_dir_search,
+ TP_PROTO(unsigned long long ino, int namelen, const char *name,
+ unsigned int major_hash, unsigned int minor_hash,
+ unsigned long long blkno),
+ TP_ARGS(ino, namelen, name, major_hash, minor_hash, blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(int, namelen)
+ __string(name, name)
+ __field(unsigned int, major_hash)
+ __field(unsigned int, minor_hash)
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->namelen = namelen;
+ __assign_str(name, name);
+ __entry->major_hash = major_hash;
+ __entry->minor_hash = minor_hash;
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%llu %.*s %u %u %llu", __entry->ino,
+ __entry->namelen, __get_str(name), /* namelen is the "%.*s" precision */
+ __entry->major_hash, __entry->minor_hash, __entry->blkno)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_dx_dir_search_leaf_info);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_delete_entry_dx);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_readdir);
+
+TRACE_EVENT(ocfs2_find_files_on_disk,
+ TP_PROTO(int namelen, const char *name, void *blkno,
+ unsigned long long dir),
+ TP_ARGS(namelen, name, blkno, dir),
+ TP_STRUCT__entry(
+ __field(int, namelen)
+ __string(name, name)
+ __field(void *, blkno)
+ __field(unsigned long long, dir)
+ ),
+ TP_fast_assign(
+ __entry->namelen = namelen;
+ __assign_str(name, name);
+ __entry->blkno = blkno;
+ __entry->dir = dir;
+ ),
+ TP_printk("%.*s %p %llu", __entry->namelen, __get_str(name),
+ __entry->blkno, __entry->dir)
+);
+
+TRACE_EVENT(ocfs2_check_dir_for_entry,
+ TP_PROTO(unsigned long long dir, int namelen, const char *name),
+ TP_ARGS(dir, namelen, name),
+ TP_STRUCT__entry(
+ __field(unsigned long long, dir)
+ __field(int, namelen)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->namelen = namelen;
+ __assign_str(name, name);
+ ),
+ TP_printk("%llu %.*s", __entry->dir,
+ __entry->namelen, __get_str(name))
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_dx_dir_attach_index);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_format_cluster);
+
+TRACE_EVENT(ocfs2_dx_dir_index_root_block,
+ TP_PROTO(unsigned long long dir,
+ unsigned int major_hash, unsigned int minor_hash,
+ int namelen, const char *name, unsigned int num_used),
+ TP_ARGS(dir, major_hash, minor_hash, namelen, name, num_used),
+ TP_STRUCT__entry(
+ __field(unsigned long long, dir)
+ __field(unsigned int, major_hash)
+ __field(unsigned int, minor_hash)
+ __field(int, namelen)
+ __string(name, name)
+ __field(unsigned int, num_used)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->major_hash = major_hash;
+ __entry->minor_hash = minor_hash;
+ __entry->namelen = namelen;
+ __assign_str(name, name);
+ __entry->num_used = num_used;
+ ),
+ TP_printk("%llu %x %x %.*s %u", __entry->dir,
+ __entry->major_hash, __entry->minor_hash,
+ __entry->namelen, __get_str(name), __entry->num_used)
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_extend_dir);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_rebalance);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_dx_dir_rebalance_split);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_prepare_dir_for_insert);
+
+/* End of trace events for fs/ocfs2/dir.c. */
+
+/* Trace events for fs/ocfs2/namei.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__dentry_ops,
+ TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
+ unsigned long long dir_blkno, unsigned long long extra),
+ TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra),
+ TP_STRUCT__entry(
+ __field(void *, dir)
+ __field(void *, dentry)
+ __field(int, name_len)
+ __string(name, name)
+ __field(unsigned long long, dir_blkno)
+ __field(unsigned long long, extra)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->dentry = dentry;
+ __entry->name_len = name_len;
+ __assign_str(name, name);
+ __entry->dir_blkno = dir_blkno;
+ __entry->extra = extra;
+ ),
+ TP_printk("%p %p %.*s %llu %llu", __entry->dir, __entry->dentry,
+ __entry->name_len, __get_str(name),
+ __entry->dir_blkno, __entry->extra)
+);
+
+#define DEFINE_OCFS2_DENTRY_OPS(name) \
+DEFINE_EVENT(ocfs2__dentry_ops, name, \
+TP_PROTO(void *dir, void *dentry, int name_len, const char *name, \
+ unsigned long long dir_blkno, unsigned long long extra), \
+ TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra))
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_lookup);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_mkdir);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_create);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_unlink);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_symlink_create);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_mv_orphaned_inode_to_new);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_lookup_ret);
+
+TRACE_EVENT(ocfs2_mknod,
+ TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
+ unsigned long long dir_blkno, unsigned long dev, int mode),
+ TP_ARGS(dir, dentry, name_len, name, dir_blkno, dev, mode),
+ TP_STRUCT__entry(
+ __field(void *, dir)
+ __field(void *, dentry)
+ __field(int, name_len)
+ __string(name, name)
+ __field(unsigned long long, dir_blkno)
+ __field(unsigned long, dev)
+ __field(int, mode)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->dentry = dentry;
+ __entry->name_len = name_len;
+ __assign_str(name, name);
+ __entry->dir_blkno = dir_blkno;
+ __entry->dev = dev;
+ __entry->mode = mode;
+ ),
+ TP_printk("%p %p %.*s %llu %lu %d", __entry->dir, __entry->dentry,
+ __entry->name_len, __get_str(name),
+ __entry->dir_blkno, __entry->dev, __entry->mode)
+);
+
+TRACE_EVENT(ocfs2_link,
+ TP_PROTO(unsigned long long ino, int old_len, const char *old_name,
+ int name_len, const char *name),
+ TP_ARGS(ino, old_len, old_name, name_len, name),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(int, old_len)
+ __string(old_name, old_name)
+ __field(int, name_len)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->old_len = old_len;
+ __assign_str(old_name, old_name);
+ __entry->name_len = name_len;
+ __assign_str(name, name);
+ ),
+ TP_printk("%llu %.*s %.*s", __entry->ino,
+ __entry->old_len, __get_str(old_name),
+ __entry->name_len, __get_str(name))
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_unlink_noent);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock_end);
+
+TRACE_EVENT(ocfs2_rename,
+ TP_PROTO(void *old_dir, void *old_dentry,
+ void *new_dir, void *new_dentry,
+ int old_len, const char *old_name,
+ int new_len, const char *new_name),
+ TP_ARGS(old_dir, old_dentry, new_dir, new_dentry,
+ old_len, old_name, new_len, new_name),
+ TP_STRUCT__entry(
+ __field(void *, old_dir)
+ __field(void *, old_dentry)
+ __field(void *, new_dir)
+ __field(void *, new_dentry)
+ __field(int, old_len)
+ __string(old_name, old_name)
+ __field(int, new_len)
+ __string(new_name, new_name)
+ ),
+ TP_fast_assign(
+ __entry->old_dir = old_dir;
+ __entry->old_dentry = old_dentry;
+ __entry->new_dir = new_dir;
+ __entry->new_dentry = new_dentry;
+ __entry->old_len = old_len;
+ __assign_str(old_name, old_name);
+ __entry->new_len = new_len;
+ __assign_str(new_name, new_name);
+ ),
+ TP_printk("%p %p %p %p %.*s %.*s",
+ __entry->old_dir, __entry->old_dentry,
+ __entry->new_dir, __entry->new_dentry,
+ __entry->old_len, __get_str(old_name),
+ __entry->new_len, __get_str(new_name))
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted);
+
+TRACE_EVENT(ocfs2_rename_target_exists,
+ TP_PROTO(int new_len, const char *new_name),
+ TP_ARGS(new_len, new_name),
+ TP_STRUCT__entry(
+ __field(int, new_len)
+ __string(new_name, new_name)
+ ),
+ TP_fast_assign(
+ __entry->new_len = new_len;
+ __assign_str(new_name, new_name);
+ ),
+ TP_printk("%.*s", __entry->new_len, __get_str(new_name))
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_rename_disagree);
+
+TRACE_EVENT(ocfs2_rename_over_existing,
+ TP_PROTO(unsigned long long new_blkno, void *new_bh,
+ unsigned long long newdi_blkno),
+ TP_ARGS(new_blkno, new_bh, newdi_blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, new_blkno)
+ __field(void *, new_bh)
+ __field(unsigned long long, newdi_blkno)
+ ),
+ TP_fast_assign(
+ __entry->new_blkno = new_blkno;
+ __entry->new_bh = new_bh;
+ __entry->newdi_blkno = newdi_blkno;
+ ),
+ TP_printk("%llu %p %llu", __entry->new_blkno, __entry->new_bh,
+ __entry->newdi_blkno)
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_create_symlink_data);
+
+TRACE_EVENT(ocfs2_symlink_begin,
+ TP_PROTO(void *dir, void *dentry, const char *symname,
+ int len, const char *name),
+ TP_ARGS(dir, dentry, symname, len, name),
+ TP_STRUCT__entry(
+ __field(void *, dir)
+ __field(void *, dentry)
+ __string(symname, symname) /* copied: the pointer may be stale when the event is printed */
+ __field(int, len)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->dentry = dentry;
+ __assign_str(symname, symname);
+ __entry->len = len;
+ __assign_str(name, name);
+ ),
+ TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry,
+ __get_str(symname), __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_blkno_stringify,
+ TP_PROTO(unsigned long long blkno, const char *name, int namelen),
+ TP_ARGS(blkno, name, namelen),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ __string(name, name)
+ __field(int, namelen)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ __assign_str(name, name);
+ __entry->namelen = namelen;
+ ),
+ TP_printk("%llu %s %d", __entry->blkno, __get_str(name),
+ __entry->namelen)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_add_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_orphan_add_end);
+
+TRACE_EVENT(ocfs2_orphan_del,
+ TP_PROTO(unsigned long long dir, const char *name, int namelen),
+ TP_ARGS(dir, name, namelen),
+ TP_STRUCT__entry(
+ __field(unsigned long long, dir)
+ __string(name, name)
+ __field(int, namelen)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __assign_str(name, name);
+ __entry->namelen = namelen;
+ ),
+ TP_printk("%llu %s %d", __entry->dir, __get_str(name),
+ __entry->namelen)
+);
+
+/* End of trace events for fs/ocfs2/namei.c. */
+
+/* Trace events for fs/ocfs2/dcache.c. */
+
+TRACE_EVENT(ocfs2_dentry_revalidate,
+ TP_PROTO(void *dentry, int len, const char *name),
+ TP_ARGS(dentry, len, name),
+ TP_STRUCT__entry(
+ __field(void *, dentry)
+ __field(int, len)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->dentry = dentry;
+ __entry->len = len;
+ __assign_str(name, name);
+ ),
+ TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_dentry_revalidate_negative,
+ TP_PROTO(int len, const char *name, unsigned long pgen,
+ unsigned long gen),
+ TP_ARGS(len, name, pgen, gen),
+ TP_STRUCT__entry(
+ __field(int, len)
+ __string(name, name)
+ __field(unsigned long, pgen)
+ __field(unsigned long, gen)
+ ),
+ TP_fast_assign(
+ __entry->len = len;
+ __assign_str(name, name);
+ __entry->pgen = pgen;
+ __entry->gen = gen;
+ ),
+ TP_printk("%.*s %lu %lu", __entry->len, __get_str(name),
+ __entry->pgen, __entry->gen)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_delete);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_dentry_revalidate_orphaned);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_nofsdata);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_dentry_revalidate_ret);
+
+TRACE_EVENT(ocfs2_find_local_alias,
+ TP_PROTO(int len, const char *name),
+ TP_ARGS(len, name),
+ TP_STRUCT__entry(
+ __field(int, len)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->len = len;
+ __assign_str(name, name);
+ ),
+ TP_printk("%.*s", __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_dentry_attach_lock,
+ TP_PROTO(int len, const char *name,
+ unsigned long long parent, void *fsdata),
+ TP_ARGS(len, name, parent, fsdata),
+ TP_STRUCT__entry(
+ __field(int, len)
+ __string(name, name)
+ __field(unsigned long long, parent)
+ __field(void *, fsdata)
+ ),
+ TP_fast_assign(
+ __entry->len = len;
+ __assign_str(name, name);
+ __entry->parent = parent;
+ __entry->fsdata = fsdata;
+ ),
+ TP_printk("%.*s %llu %p", __entry->len, __get_str(name),
+ __entry->parent, __entry->fsdata)
+);
+
+TRACE_EVENT(ocfs2_dentry_attach_lock_found,
+ TP_PROTO(const char *name, unsigned long long parent,
+ unsigned long long ino),
+ TP_ARGS(name, parent, ino),
+ TP_STRUCT__entry(
+ __string(name, name)
+ __field(unsigned long long, parent)
+ __field(unsigned long long, ino)
+ ),
+ TP_fast_assign(
+ __assign_str(name, name);
+ __entry->parent = parent;
+ __entry->ino = ino;
+ ),
+ TP_printk("%s %llu %llu", __get_str(name), __entry->parent, __entry->ino)
+);
+/* End of trace events for fs/ocfs2/dcache.c. */
+
+/* Trace events for fs/ocfs2/export.c. */
+
+TRACE_EVENT(ocfs2_get_dentry_begin,
+ TP_PROTO(void *sb, void *handle, unsigned long long blkno),
+ TP_ARGS(sb, handle, blkno),
+ TP_STRUCT__entry(
+ __field(void *, sb)
+ __field(void *, handle)
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->sb = sb;
+ __entry->handle = handle;
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%p %p %llu", __entry->sb, __entry->handle, __entry->blkno)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_get_dentry_test_bit);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_get_dentry_stale);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_get_dentry_generation);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_dentry_end);
+
+TRACE_EVENT(ocfs2_get_parent,
+ TP_PROTO(void *child, int len, const char *name,
+ unsigned long long ino),
+ TP_ARGS(child, len, name, ino),
+ TP_STRUCT__entry(
+ __field(void *, child)
+ __field(int, len)
+ __string(name, name)
+ __field(unsigned long long, ino)
+ ),
+ TP_fast_assign(
+ __entry->child = child;
+ __entry->len = len;
+ __assign_str(name, name);
+ __entry->ino = ino;
+ ),
+ TP_printk("%p %.*s %llu", __entry->child, __entry->len,
+ __get_str(name), __entry->ino)
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_parent_end);
+
+TRACE_EVENT(ocfs2_encode_fh_begin,
+ TP_PROTO(void *dentry, int name_len, const char *name,
+ void *fh, int len, int connectable),
+ TP_ARGS(dentry, name_len, name, fh, len, connectable),
+ TP_STRUCT__entry(
+ __field(void *, dentry)
+ __field(int, name_len)
+ __string(name, name)
+ __field(void *, fh)
+ __field(int, len)
+ __field(int, connectable)
+ ),
+ TP_fast_assign(
+ __entry->dentry = dentry;
+ __entry->name_len = name_len;
+ __assign_str(name, name);
+ __entry->fh = fh;
+ __entry->len = len;
+ __entry->connectable = connectable;
+ ),
+ TP_printk("%p %.*s %p %d %d", __entry->dentry, __entry->name_len,
+ __get_str(name), __entry->fh, __entry->len,
+ __entry->connectable)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_self);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_parent);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_encode_fh_type);
+
+/* End of trace events for fs/ocfs2/export.c. */
+
+/* Trace events for fs/ocfs2/journal.c. */
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_commit_cache_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_journal_init);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_journal_init_maxlen);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_journal_shutdown);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_journal_shutdown_wait);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_complete_recovery);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_complete_recovery_end);
+
+TRACE_EVENT(ocfs2_complete_recovery_slot,
+ TP_PROTO(int slot, unsigned long long la_ino,
+ unsigned long long tl_ino, void *qrec),
+ TP_ARGS(slot, la_ino, tl_ino, qrec),
+ TP_STRUCT__entry(
+ __field(int, slot)
+ __field(unsigned long long, la_ino)
+ __field(unsigned long long, tl_ino)
+ __field(void *, qrec)
+ ),
+ TP_fast_assign(
+ __entry->slot = slot;
+ __entry->la_ino = la_ino;
+ __entry->tl_ino = tl_ino;
+ __entry->qrec = qrec;
+ ),
+ TP_printk("%d %llu %llu %p", __entry->slot, __entry->la_ino,
+ __entry->tl_ino, __entry->qrec)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_recovery_thread_node);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_recovery_thread_end);
+
+TRACE_EVENT(ocfs2_recovery_thread,
+ TP_PROTO(int node_num, int osb_node_num, int disable,
+ void *recovery_thread, int map_set),
+ TP_ARGS(node_num, osb_node_num, disable, recovery_thread, map_set),
+ TP_STRUCT__entry(
+ __field(int, node_num)
+ __field(int, osb_node_num)
+ __field(int, disable)
+ __field(void *, recovery_thread) /* recorded and printed with %p only */
+ __field(int, map_set)
+ ),
+ TP_fast_assign(
+ __entry->node_num = node_num;
+ __entry->osb_node_num = osb_node_num;
+ __entry->disable = disable;
+ __entry->recovery_thread = recovery_thread;
+ __entry->map_set = map_set;
+ ),
+ TP_printk("%d %d %d %p %d", __entry->node_num,
+ __entry->osb_node_num, __entry->disable,
+ __entry->recovery_thread, __entry->map_set)
+);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_replay_journal_recovered);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_lock_err);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_skip);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_recover_node);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_recover_node_skip);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_mark_dead_nodes);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_end);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_filldir);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_recover_orphans);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_recover_orphans_iput);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_wait_on_mount);
+
+/* End of trace events for fs/ocfs2/journal.c. */
+
+/* Trace events for fs/ocfs2/buffer_head_io.c. */
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_read_blocks_sync);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_read_blocks_sync_jbd);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_read_blocks_from_disk);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_bh);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_end);
+
+TRACE_EVENT(ocfs2_write_block,
+ TP_PROTO(unsigned long long block, void *ci),
+ TP_ARGS(block, ci),
+ TP_STRUCT__entry(
+ __field(unsigned long long, block)
+ __field(void *, ci)
+ ),
+ TP_fast_assign(
+ __entry->block = block;
+ __entry->ci = ci;
+ ),
+ TP_printk("%llu %p", __entry->block, __entry->ci)
+);
+
+TRACE_EVENT(ocfs2_read_blocks_begin,
+ TP_PROTO(void *ci, unsigned long long block,
+ unsigned int nr, int flags),
+ TP_ARGS(ci, block, nr, flags),
+ TP_STRUCT__entry(
+ __field(void *, ci)
+ __field(unsigned long long, block)
+ __field(unsigned int, nr)
+ __field(int, flags)
+ ),
+ TP_fast_assign(
+ __entry->ci = ci;
+ __entry->block = block;
+ __entry->nr = nr;
+ __entry->flags = flags;
+ ),
+ TP_printk("%p %llu %u %d", __entry->ci, __entry->block,
+ __entry->nr, __entry->flags)
+);
+
+/* End of trace events for fs/ocfs2/buffer_head_io.c. */
+
+/* Trace events for fs/ocfs2/uptodate.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_copied_metadata_tree);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_metadata_cache_purge);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_buffer_cached_begin);
+
+TRACE_EVENT(ocfs2_buffer_cached_end,
+ TP_PROTO(int index, void *item),
+ TP_ARGS(index, item),
+ TP_STRUCT__entry(
+ __field(int, index)
+ __field(void *, item)
+ ),
+ TP_fast_assign(
+ __entry->index = index;
+ __entry->item = item;
+ ),
+ TP_printk("%d %p", __entry->index, __entry->item)
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_append_cache_array);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_insert_cache_tree);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_expand_cache);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_set_buffer_uptodate);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_set_buffer_uptodate_begin);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_remove_metadata_array);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_remove_metadata_tree);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_remove_block_from_cache);
+
+/* End of trace events for fs/ocfs2/uptodate.c. */
+#endif /* _TRACE_OCFS2_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE ocfs2_trace
+#include <trace/define_trace.h>
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 7365e2e0870..f266d67df3c 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -17,20 +17,18 @@
#include "ocfs2.h"
-/* Common stuff */
-/* id number of quota format */
-#define QFMT_OCFS2 3
-
/*
* In-memory structures
*/
struct ocfs2_dquot {
struct dquot dq_dquot; /* Generic VFS dquot */
loff_t dq_local_off; /* Offset in the local quota file */
+ u64 dq_local_phys_blk; /* Physical block carrying quota structure */
struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
s64 dq_origspace; /* Last globally synced space usage */
s64 dq_originodes; /* Last globally synced inode usage */
+ struct llist_node list; /* Member of list of dquots to drop */
};
/* Description of one chunk to recover in memory */
@@ -50,14 +48,14 @@ struct ocfs2_mem_dqinfo {
unsigned int dqi_chunks; /* Number of chunks in local quota file */
unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
unsigned int dqi_syncms; /* How often should we sync with other nodes */
- unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */
struct list_head dqi_chunk; /* List of chunks */
struct inode *dqi_gqinode; /* Global quota file inode */
struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
+ u64 dqi_giblk; /* Number of block with global information header */
struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
- struct buffer_head *dqi_ibh; /* Buffer with information header */
+ struct buffer_head *dqi_libh; /* Buffer with local information header */
struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
struct delayed_work dqi_sync_work; /* Work for syncing dquots */
struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
@@ -107,13 +105,15 @@ static inline int ocfs2_global_release_dquot(struct dquot *dquot)
int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
-int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
- struct buffer_head **bh);
-
-extern struct dquot_operations ocfs2_quota_operations;
+int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh);
+int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
+ struct buffer_head **bh);
+int ocfs2_create_local_dquot(struct dquot *dquot);
+int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
+int ocfs2_local_write_dquot(struct dquot *dquot);
+void ocfs2_drop_dquot_refs(struct work_struct *work);
+
+extern const struct dquot_operations ocfs2_quota_operations;
extern struct quota_format_type ocfs2_quota_format;
-int ocfs2_quota_setup(void);
-void ocfs2_quota_shutdown(void);
-
#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 1ed0f7c8686..b990a62cff5 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,14 +3,15 @@
*/
#include <linux/spinlock.h>
#include <linux/fs.h>
+#include <linux/slab.h>
#include <linux/quota.h>
#include <linux/quotaops.h>
#include <linux/dqblk_qtree.h>
#include <linux/jiffies.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
+#include <linux/llist.h>
-#define MLOG_MASK_PREFIX ML_QUOTA
#include <cluster/masklog.h>
#include "ocfs2_fs.h"
@@ -23,9 +24,45 @@
#include "sysfile.h"
#include "dlmglue.h"
#include "uptodate.h"
+#include "super.h"
+#include "buffer_head_io.h"
#include "quota.h"
+#include "ocfs2_trace.h"
-static struct workqueue_struct *ocfs2_quota_wq = NULL;
+/*
+ * Locking of quotas with OCFS2 is rather complex. Here are rules that
+ * should be obeyed by all the functions:
+ * - any write of quota structure (either to local or global file) is protected
+ * by dqio_mutex or dquot->dq_lock.
+ * - any modification of global quota file holds inode cluster lock, i_mutex,
+ * and ip_alloc_sem of the global quota file (achieved by
+ * ocfs2_lock_global_qf). It also has to hold qinfo_lock.
+ * - an allocation of new blocks for local quota file is protected by
+ * its ip_alloc_sem
+ *
+ * A rough sketch of locking dependencies (lf = local file, gf = global file):
+ * Normal filesystem operation:
+ * start_trans -> dqio_mutex -> write to lf
+ * Syncing of local and global file:
+ * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ * write to gf
+ * -> write to lf
+ * Acquire dquot for the first time:
+ * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf
+ * -> alloc space for gf
+ * -> start_trans -> qinfo_lock -> write to gf
+ * -> ip_alloc_sem of lf -> alloc space for lf
+ * -> write to lf
+ * Release last reference to dquot:
+ * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf
+ * -> write to lf
+ * Note that all the above operations also hold the inode cluster lock of lf.
+ * Recovery:
+ * inode cluster lock of recovered lf
+ * -> read bitmaps -> ip_alloc_sem of lf
+ * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ * write to gf
+ */
static void qsync_work_fn(struct work_struct *work);
@@ -59,7 +96,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
struct ocfs2_global_disk_dqblk *d = dp;
struct mem_dqblk *m = &dquot->dq_dqb;
- d->dqb_id = cpu_to_le32(dquot->dq_id);
+ d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id));
d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
@@ -69,17 +106,21 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
d->dqb_btime = cpu_to_le64(m->dqb_btime);
d->dqb_itime = cpu_to_le64(m->dqb_itime);
+ d->dqb_pad1 = d->dqb_pad2 = 0;
}
static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
{
struct ocfs2_global_disk_dqblk *d = dp;
struct ocfs2_mem_dqinfo *oinfo =
- sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+ sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
if (qtree_entry_unused(&oinfo->dqi_gi, dp))
return 0;
- return le32_to_cpu(d->dqb_id) == dquot->dq_id;
+
+ return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type,
+ le32_to_cpu(d->dqb_id)),
+ dquot->dq_id);
}
struct qtree_fmt_operations ocfs2_global_ops = {
@@ -88,14 +129,12 @@ struct qtree_fmt_operations ocfs2_global_ops = {
.is_id = ocfs2_global_is_id,
};
-static int ocfs2_validate_quota_block(struct super_block *sb,
- struct buffer_head *bh)
+int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
{
struct ocfs2_disk_dqtrailer *dqt =
ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
- mlog(0, "Validating quota block %llu\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_validate_quota_block((unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
@@ -107,45 +146,19 @@ static int ocfs2_validate_quota_block(struct super_block *sb,
return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
}
-int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
- struct buffer_head **bh)
+int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
+ struct buffer_head **bhp)
{
- int rc = 0;
- struct buffer_head *tmp = *bh;
+ int rc;
- rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
- ocfs2_validate_quota_block);
+ *bhp = NULL;
+ rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0,
+ ocfs2_validate_quota_block);
if (rc)
mlog_errno(rc);
-
- /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
- if (!rc && !*bh)
- *bh = tmp;
-
return rc;
}
-static int ocfs2_get_quota_block(struct inode *inode, int block,
- struct buffer_head **bh)
-{
- u64 pblock, pcount;
- int err;
-
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
- err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
- if (err) {
- mlog_errno(err);
- return err;
- }
- *bh = sb_getblk(inode->i_sb, pblock);
- if (!*bh) {
- err = -EIO;
- mlog_errno(err);
- }
- return err;;
-}
-
/* Read data from global quotafile - avoid pagecache and such because we cannot
* afford acquiring the locks... We use quota cluster lock to serialize
* operations. Caller is responsible for acquiring it. */
@@ -160,6 +173,7 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
int err = 0;
struct buffer_head *bh;
size_t toread, tocopy;
+ u64 pblock = 0, pcount = 0;
if (off > i_size)
return 0;
@@ -168,8 +182,19 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
toread = len;
while (toread > 0) {
tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
+ if (!pcount) {
+ err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock,
+ &pcount, NULL);
+ if (err) {
+ mlog_errno(err);
+ return err;
+ }
+ } else {
+ pcount--;
+ pblock++;
+ }
bh = NULL;
- err = ocfs2_read_quota_block(gqinode, blk, &bh);
+ err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
if (err) {
mlog_errno(err);
return err;
@@ -197,6 +222,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
int err = 0, new = 0, ja_type;
struct buffer_head *bh = NULL;
handle_t *handle = journal_current_handle();
+ u64 pblock, pcount;
if (!handle) {
mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
@@ -209,32 +235,37 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
}
- mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
- if (gqinode->i_size < off + len) {
- down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
- err = ocfs2_extend_no_holes(gqinode, off + len, off);
- up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
- if (err < 0)
- goto out;
+ if (i_size_read(gqinode) < off + len) {
+ loff_t rounded_end =
+ ocfs2_align_bytes_to_blocks(sb, off + len);
+
+ /* Space is already allocated in ocfs2_acquire_dquot() */
err = ocfs2_simple_size_update(gqinode,
oinfo->dqi_gqi_bh,
- off + len);
+ rounded_end);
if (err < 0)
goto out;
new = 1;
}
+ err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL);
+ if (err) {
+ mlog_errno(err);
+ goto out;
+ }
/* Not rewriting whole block? */
if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
!new) {
- err = ocfs2_read_quota_block(gqinode, blk, &bh);
+ err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
} else {
- err = ocfs2_get_quota_block(gqinode, blk, &bh);
+ bh = sb_getblk(sb, pblock);
+ if (!bh)
+ err = -ENOMEM;
ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
}
if (err) {
mlog_errno(err);
- return err;
+ goto out;
}
lock_buffer(bh);
if (new)
@@ -243,25 +274,22 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
flush_dcache_page(bh->b_page);
set_buffer_uptodate(bh);
unlock_buffer(bh);
- ocfs2_set_buffer_uptodate(gqinode, bh);
- err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type);
+ ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh);
+ err = ocfs2_journal_access_dq(handle, INODE_CACHE(gqinode), bh,
+ ja_type);
if (err < 0) {
brelse(bh);
goto out;
}
- err = ocfs2_journal_dirty(handle, bh);
+ ocfs2_journal_dirty(handle, bh);
brelse(bh);
- if (err < 0)
- goto out;
out:
if (err) {
- mutex_unlock(&gqinode->i_mutex);
mlog_errno(err);
return err;
}
gqinode->i_version++;
ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
- mutex_unlock(&gqinode->i_mutex);
return len;
}
@@ -279,11 +307,23 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
else
WARN_ON(bh != oinfo->dqi_gqi_bh);
spin_unlock(&dq_data_lock);
+ if (ex) {
+ mutex_lock(&oinfo->dqi_gqinode->i_mutex);
+ down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+ } else {
+ down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+ }
return 0;
}
void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
{
+ if (ex) {
+ up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+ mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
+ } else {
+ up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+ }
ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
brelse(oinfo->dqi_gqi_bh);
spin_lock(&dq_data_lock);
@@ -301,10 +341,9 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
struct ocfs2_global_disk_dqinfo dinfo;
struct mem_dqinfo *info = sb_dqinfo(sb, type);
struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ u64 pcount;
int status;
- mlog_entry_void();
-
/* Read global header */
gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
OCFS2_INVALID_SLOT);
@@ -327,9 +366,19 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
mlog_errno(status);
goto out_err;
}
+
+ status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
+ &pcount, NULL);
+ if (status < 0)
+ goto out_unlock;
+
+ status = ocfs2_qinfo_lock(oinfo, 0);
+ if (status < 0)
+ goto out_unlock;
status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
sizeof(struct ocfs2_global_disk_dqinfo),
OCFS2_GLOBAL_INFO_OFF);
+ ocfs2_qinfo_unlock(oinfo, 0);
ocfs2_unlock_global_qf(oinfo, 0);
if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
@@ -342,7 +391,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
- oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
@@ -351,12 +399,15 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
OCFS2_QBLK_RESERVED_SPACE;
oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
- queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
- oinfo->dqi_syncjiff);
+ schedule_delayed_work(&oinfo->dqi_sync_work,
+ msecs_to_jiffies(oinfo->dqi_syncms));
out_err:
- mlog_exit(status);
return status;
+out_unlock:
+ ocfs2_unlock_global_qf(oinfo, 0);
+ mlog_errno(status);
+ goto out_err;
}
/* Write information to global quota file. Expects exlusive lock on quota
@@ -402,44 +453,23 @@ int ocfs2_global_write_info(struct super_block *sb, int type)
return err;
}
-/* Read in information from global quota file and acquire a reference to it.
- * dquot_acquire() has already started the transaction and locked quota file */
-int ocfs2_global_read_dquot(struct dquot *dquot)
+static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
{
- int err, err2, ex = 0;
- struct ocfs2_mem_dqinfo *info =
- sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
- err = ocfs2_qinfo_lock(info, 0);
- if (err < 0)
- goto out;
- err = qtree_read_dquot(&info->dqi_gi, dquot);
- if (err < 0)
- goto out_qlock;
- OCFS2_DQUOT(dquot)->dq_use_count++;
- OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
- OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
- if (!dquot->dq_off) { /* No real quota entry? */
- /* Upgrade to exclusive lock for allocation */
- err = ocfs2_qinfo_lock(info, 1);
- if (err < 0)
- goto out_qlock;
- ex = 1;
- }
- err = qtree_write_dquot(&info->dqi_gi, dquot);
- if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
- err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
- if (!err)
- err = err2;
- }
-out_qlock:
- if (ex)
- ocfs2_qinfo_unlock(info, 1);
- ocfs2_qinfo_unlock(info, 0);
-out:
- if (err < 0)
- mlog_errno(err);
- return err;
+ /*
+ * We may need to allocate tree blocks and a leaf block but not the
+ * root block
+ */
+ return oinfo->dqi_gi.dqi_qtree_depth;
+}
+
+static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
+{
+ /* We modify all the allocated blocks, tree root, info block and
+ * the inode */
+ return (ocfs2_global_qinit_alloc(sb, type) + 2) *
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1;
}
/* Sync local information about quota modifications with global quota file.
@@ -449,7 +479,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
{
int err, err2;
struct super_block *sb = dquot->dq_sb;
- int type = dquot->dq_type;
+ int type = dquot->dq_id.type;
struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
struct ocfs2_global_disk_dqblk dqblk;
s64 spacechange, inodechange;
@@ -478,9 +508,11 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
olditime = dquot->dq_dqb.dqb_itime;
oldbtime = dquot->dq_dqb.dqb_btime;
ocfs2_global_disk2memdqb(dquot, &dqblk);
- mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
- dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
- dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
+ trace_ocfs2_sync_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+ dquot->dq_dqb.dqb_curspace,
+ (long long)spacechange,
+ dquot->dq_dqb.dqb_curinodes,
+ (long long)inodechange);
if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
dquot->dq_dqb.dqb_curspace += spacechange;
if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
@@ -527,9 +559,9 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
spin_unlock(&dq_data_lock);
err = ocfs2_qinfo_lock(info, freeing);
if (err < 0) {
- mlog(ML_ERROR, "Failed to lock quota info, loosing quota write"
- " (type=%d, id=%u)\n", dquot->dq_type,
- (unsigned)dquot->dq_id);
+ mlog(ML_ERROR, "Failed to lock quota info, losing quota write"
+ " (type=%d, id=%u)\n", dquot->dq_id.type,
+ (unsigned)from_kqid(&init_user_ns, dquot->dq_id));
goto out;
}
if (freeing)
@@ -564,9 +596,10 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
struct ocfs2_super *osb = OCFS2_SB(sb);
int status = 0;
- mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
- dquot->dq_type, type, sb->s_id);
- if (type != dquot->dq_type)
+ trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id),
+ dquot->dq_id.type,
+ type, sb->s_id);
+ if (type != dquot->dq_id.type)
goto out;
status = ocfs2_lock_global_qf(oinfo, 1);
if (status < 0)
@@ -580,19 +613,17 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
}
mutex_lock(&sb_dqopt(sb)->dqio_mutex);
status = ocfs2_sync_dquot(dquot);
- mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
if (status < 0)
mlog_errno(status);
/* We have to write local structure as well... */
- dquot_mark_dquot_dirty(dquot);
- status = dquot_commit(dquot);
+ status = ocfs2_local_write_dquot(dquot);
if (status < 0)
mlog_errno(status);
+ mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
ocfs2_commit_trans(osb, handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
out:
- mlog_exit(status);
return status;
}
@@ -604,8 +635,8 @@ static void qsync_work_fn(struct work_struct *work)
struct super_block *sb = oinfo->dqi_gqinode->i_sb;
dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
- queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
- oinfo->dqi_syncjiff);
+ schedule_delayed_work(&oinfo->dqi_sync_work,
+ msecs_to_jiffies(oinfo->dqi_syncms));
}
/*
@@ -618,7 +649,8 @@ static int ocfs2_write_dquot(struct dquot *dquot)
struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
int status = 0;
- mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+ trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+ dquot->dq_id.type);
handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
if (IS_ERR(handle)) {
@@ -626,106 +658,205 @@ static int ocfs2_write_dquot(struct dquot *dquot)
mlog_errno(status);
goto out;
}
- status = dquot_commit(dquot);
+ mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
+ status = ocfs2_local_write_dquot(dquot);
+ mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
ocfs2_commit_trans(osb, handle);
out:
- mlog_exit(status);
return status;
}
-int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
{
- struct ocfs2_mem_dqinfo *oinfo;
- int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ /*
+ * We modify tree, leaf block, global info, local chunk header,
+ * global and local inode; OCFS2_QINFO_WRITE_CREDITS already
+ * accounts for inode update
+ */
+ return (oinfo->dqi_gi.dqi_qtree_depth + 2) *
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
+ OCFS2_QINFO_WRITE_CREDITS +
+ OCFS2_INODE_UPDATE_CREDITS;
+}
- if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
- return 0;
+void ocfs2_drop_dquot_refs(struct work_struct *work)
+{
+ struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+ dquot_drop_work);
+ struct llist_node *list;
+ struct ocfs2_dquot *odquot, *next_odquot;
- oinfo = sb_dqinfo(sb, type)->dqi_priv;
- /* We modify tree, leaf block, global info, local chunk header,
- * global and local inode */
- return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
- 2 * OCFS2_INODE_UPDATE_CREDITS;
+ list = llist_del_all(&osb->dquot_drop_list);
+ llist_for_each_entry_safe(odquot, next_odquot, list, list) {
+ /* Drop the reference we acquired in ocfs2_release_dquot() */
+ dqput(&odquot->dq_dquot);
+ }
}
+/*
+ * Called when the last reference to dquot is dropped. If we are called from
+ * downconvert thread, we cannot do all the handling here because grabbing
+ * quota lock could deadlock (the node holding the quota lock could need some
+ * other cluster lock to proceed but with blocked downconvert thread we cannot
+ * release any lock).
+ */
static int ocfs2_release_dquot(struct dquot *dquot)
{
handle_t *handle;
struct ocfs2_mem_dqinfo *oinfo =
- sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+ sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
int status = 0;
- mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+ trace_ocfs2_release_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+ dquot->dq_id.type);
+ mutex_lock(&dquot->dq_lock);
+ /* Check whether we are not racing with some other dqget() */
+ if (atomic_read(&dquot->dq_count) > 1)
+ goto out;
+ /* Running from downconvert thread? Postpone quota processing to wq */
+ if (current == osb->dc_task) {
+ /*
+ * Grab our own reference to dquot and queue it for delayed
+ * dropping. Quota code rechecks after calling
+ * ->release_dquot() and won't free dquot structure.
+ */
+ dqgrab(dquot);
+ /* First entry on list -> queue work */
+ if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
+ queue_work(ocfs2_wq, &osb->dquot_drop_work);
+ goto out;
+ }
status = ocfs2_lock_global_qf(oinfo, 1);
if (status < 0)
goto out;
handle = ocfs2_start_trans(osb,
- ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type));
+ ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_id.type));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out_ilock;
}
- status = dquot_release(dquot);
+
+ status = ocfs2_global_release_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ status = ocfs2_local_release_dquot(handle, dquot);
+ /*
+ * If we fail here, we cannot do much as global structure is
+ * already released. So just complain...
+ */
+ if (status < 0)
+ mlog_errno(status);
+ /*
+ * Clear dq_off so that we search for the structure in quota file next
+ * time we acquire it. The structure might be deleted and reallocated
+ * elsewhere by another node while our dquot structure is on freelist.
+ */
+ dquot->dq_off = 0;
+ clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+out_trans:
ocfs2_commit_trans(osb, handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
out:
- mlog_exit(status);
+ mutex_unlock(&dquot->dq_lock);
+ if (status)
+ mlog_errno(status);
return status;
}
-int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
-{
- struct ocfs2_mem_dqinfo *oinfo;
- int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
- struct ocfs2_dinode *lfe, *gfe;
-
- if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
- return 0;
-
- oinfo = sb_dqinfo(sb, type)->dqi_priv;
- gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
- lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
- /* We can extend local file + global file. In local file we
- * can modify info, chunk header block and dquot block. In
- * global file we can modify info, tree and leaf block */
- return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
- ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
- 3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
-}
-
+/*
+ * Read global dquot structure from disk or create it if it does
+ * not exist. Also update use count of the global structure and
+ * create structure in node-local quota file.
+ */
static int ocfs2_acquire_dquot(struct dquot *dquot)
{
+ int status = 0, err;
+ int ex = 0;
+ struct super_block *sb = dquot->dq_sb;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ int type = dquot->dq_id.type;
+ struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+ struct inode *gqinode = info->dqi_gqinode;
+ int need_alloc = ocfs2_global_qinit_alloc(sb, type);
handle_t *handle;
- struct ocfs2_mem_dqinfo *oinfo =
- sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
- struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
- int status = 0;
- mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
- /* We need an exclusive lock, because we're going to update use count
- * and instantiate possibly new dquot structure */
- status = ocfs2_lock_global_qf(oinfo, 1);
+ trace_ocfs2_acquire_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+ type);
+ mutex_lock(&dquot->dq_lock);
+ /*
+ * We need an exclusive lock because we are going to update the use
+ * count and possibly instantiate a new dquot structure
+ */
+ status = ocfs2_lock_global_qf(info, 1);
if (status < 0)
goto out;
+ status = ocfs2_qinfo_lock(info, 0);
+ if (status < 0)
+ goto out_dq;
+ /*
+ * We always want to read dquot structure from disk because we don't
+ * know what happened with it while it was on freelist.
+ */
+ status = qtree_read_dquot(&info->dqi_gi, dquot);
+ ocfs2_qinfo_unlock(info, 0);
+ if (status < 0)
+ goto out_dq;
+
+ OCFS2_DQUOT(dquot)->dq_use_count++;
+ OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+ OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+ if (!dquot->dq_off) { /* No real quota entry? */
+ ex = 1;
+ /*
+ * Add blocks to quota file before we start a transaction since
+ * locking allocators ranks above a transaction start
+ */
+ WARN_ON(journal_current_handle());
+ status = ocfs2_extend_no_holes(gqinode, NULL,
+ i_size_read(gqinode) + (need_alloc << sb->s_blocksize_bits),
+ i_size_read(gqinode));
+ if (status < 0)
+ goto out_dq;
+ }
+
handle = ocfs2_start_trans(osb,
- ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
+ ocfs2_calc_global_qinit_credits(sb, type));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
- mlog_errno(status);
- goto out_ilock;
+ goto out_dq;
}
- status = dquot_acquire(dquot);
+ status = ocfs2_qinfo_lock(info, ex);
+ if (status < 0)
+ goto out_trans;
+ status = qtree_write_dquot(&info->dqi_gi, dquot);
+ if (ex && info_dirty(sb_dqinfo(sb, type))) {
+ err = __ocfs2_global_write_info(sb, type);
+ if (!status)
+ status = err;
+ }
+ ocfs2_qinfo_unlock(info, ex);
+out_trans:
ocfs2_commit_trans(osb, handle);
-out_ilock:
- ocfs2_unlock_global_qf(oinfo, 1);
+out_dq:
+ ocfs2_unlock_global_qf(info, 1);
+ if (status < 0)
+ goto out;
+
+ status = ocfs2_create_local_dquot(dquot);
+ if (status < 0)
+ goto out;
+ set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out:
- mlog_exit(status);
+ mutex_unlock(&dquot->dq_lock);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -740,13 +871,13 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
int sync = 0;
int status;
struct super_block *sb = dquot->dq_sb;
- int type = dquot->dq_type;
+ int type = dquot->dq_id.type;
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(sb);
- mlog_entry("id=%u, type=%d", dquot->dq_id, type);
- dquot_mark_dquot_dirty(dquot);
+ trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id),
+ type);
/* In case user set some limits, sync dquot immediately to global
* quota file so that information propagates quicker */
@@ -769,19 +900,22 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
mlog_errno(status);
goto out_ilock;
}
+ mutex_lock(&sb_dqopt(sb)->dqio_mutex);
status = ocfs2_sync_dquot(dquot);
if (status < 0) {
mlog_errno(status);
- goto out_trans;
+ goto out_dlock;
}
/* Now write updated local dquot structure */
- status = dquot_commit(dquot);
-out_trans:
+ status = ocfs2_local_write_dquot(dquot);
+out_dlock:
+ mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
ocfs2_commit_trans(osb, handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
out:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -792,8 +926,6 @@ static int ocfs2_write_info(struct super_block *sb, int type)
int status = 0;
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
- mlog_entry_void();
-
status = ocfs2_lock_global_qf(oinfo, 1);
if (status < 0)
goto out;
@@ -808,7 +940,8 @@ static int ocfs2_write_info(struct super_block *sb, int type)
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
out:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -827,15 +960,8 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
kmem_cache_free(ocfs2_dquot_cachep, dquot);
}
-struct dquot_operations ocfs2_quota_operations = {
- .initialize = dquot_initialize,
- .drop = dquot_drop,
- .alloc_space = dquot_alloc_space,
- .alloc_inode = dquot_alloc_inode,
- .free_space = dquot_free_space,
- .free_inode = dquot_free_inode,
- .transfer = dquot_transfer,
- .write_dquot = ocfs2_write_dquot,
+const struct dquot_operations ocfs2_quota_operations = {
+ /* We never make dquot dirty so .write_dquot is never called */
.acquire_dquot = ocfs2_acquire_dquot,
.release_dquot = ocfs2_release_dquot,
.mark_dirty = ocfs2_mark_dquot_dirty,
@@ -843,20 +969,3 @@ struct dquot_operations ocfs2_quota_operations = {
.alloc_dquot = ocfs2_alloc_dquot,
.destroy_dquot = ocfs2_destroy_dquot,
};
-
-int ocfs2_quota_setup(void)
-{
- ocfs2_quota_wq = create_workqueue("o2quot");
- if (!ocfs2_quota_wq)
- return -ENOMEM;
- return 0;
-}
-
-void ocfs2_quota_shutdown(void)
-{
- if (ocfs2_quota_wq) {
- flush_workqueue(ocfs2_quota_wq);
- destroy_workqueue(ocfs2_quota_wq);
- ocfs2_quota_wq = NULL;
- }
-}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 07deec5e972..2001862bf2b 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,11 +3,11 @@
*/
#include <linux/fs.h>
+#include <linux/slab.h>
#include <linux/quota.h>
#include <linux/quotaops.h>
#include <linux/module.h>
-#define MLOG_MASK_PREFIX ML_QUOTA
#include <cluster/masklog.h>
#include "ocfs2_fs.h"
@@ -20,6 +20,9 @@
#include "sysfile.h"
#include "dlmglue.h"
#include "quota.h"
+#include "uptodate.h"
+#include "super.h"
+#include "ocfs2_trace.h"
/* Number of local quota structures per block */
static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -100,13 +103,14 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
handle_t *handle;
int status;
- handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
return status;
}
- status = ocfs2_journal_access_dq(handle, inode, bh,
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(inode), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -116,12 +120,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
lock_buffer(bh);
modify(bh, private);
unlock_buffer(bh);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- ocfs2_commit_trans(OCFS2_SB(sb), handle);
- return status;
- }
+ ocfs2_journal_dirty(handle, bh);
+
status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
if (status < 0) {
mlog_errno(status);
@@ -130,6 +130,39 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
return 0;
}
+/*
+ * Read quota block from a given logical offset.
+ *
+ * This function acquires ip_alloc_sem and thus it must not be called with a
+ * transaction started.
+ */
+static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+ struct buffer_head **bh)
+{
+ int rc = 0;
+ struct buffer_head *tmp = *bh;
+
+ if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
+ ocfs2_error(inode->i_sb,
+ "Quota file %llu is probably corrupted! Requested "
+ "to read block %Lu but file has size only %Lu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)v_block,
+ (unsigned long long)i_size_read(inode));
+ return -EIO;
+ }
+ rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
+ ocfs2_validate_quota_block);
+ if (rc)
+ mlog_errno(rc);
+
+ /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
/* Check whether we understand format of quota files */
static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
{
@@ -371,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
int status = 0;
struct ocfs2_quota_recovery *rec;
- mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for "
+ "slot %u\n", osb->dev_str, slot_num);
+
rec = ocfs2_alloc_quota_recovery();
if (!rec)
return ERR_PTR(-ENOMEM);
@@ -442,11 +477,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
struct ocfs2_recovery_chunk *rchunk, *next;
qsize_t spacechange, inodechange;
- mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
-
- status = ocfs2_lock_global_qf(oinfo, 1);
- if (status < 0)
- goto out;
+ trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type);
list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
chunk = rchunk->rc_chunk;
@@ -459,7 +490,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
break;
}
dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
- for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
+ for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
qbh = NULL;
status = ocfs2_read_quota_block(lqinode,
ol_dqblk_block(sb, chunk, bit),
@@ -470,7 +501,9 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
}
dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
ol_dqblk_block_off(sb, chunk, bit));
- dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type);
+ dquot = dqget(sb,
+ make_kqid(&init_user_ns, type,
+ le64_to_cpu(dqblk->dqb_id)));
if (!dquot) {
status = -EIO;
mlog(ML_ERROR, "Failed to get quota structure "
@@ -480,12 +513,18 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
type);
goto out_put_bh;
}
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_put_dquot;
+ }
+
handle = ocfs2_start_trans(OCFS2_SB(sb),
OCFS2_QSYNC_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
- goto out_put_dquot;
+ goto out_drop_lock;
}
mutex_lock(&sb_dqopt(sb)->dqio_mutex);
spin_lock(&dq_data_lock);
@@ -506,23 +545,24 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
goto out_commit;
}
/* Release local quota file entry */
- status = ocfs2_journal_access_dq(handle, lqinode,
+ status = ocfs2_journal_access_dq(handle,
+ INODE_CACHE(lqinode),
qbh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_commit;
}
lock_buffer(qbh);
- WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
- ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+ WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap));
+ ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, 1);
unlock_buffer(qbh);
- status = ocfs2_journal_dirty(handle, qbh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, qbh);
out_commit:
mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_drop_lock:
+ ocfs2_unlock_global_qf(oinfo, 1);
out_put_dquot:
dqput(dquot);
out_put_bh:
@@ -537,11 +577,10 @@ out_put_bh:
if (status < 0)
break;
}
- ocfs2_unlock_global_qf(oinfo, 1);
-out:
if (status < 0)
free_recovery_list(&(rec->r_list[type]));
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -561,12 +600,14 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
struct inode *lqinode;
unsigned int flags;
- mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
+ "slot %u\n", osb->dev_str, slot_num);
+
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
for (type = 0; type < MAXQUOTAS; type++) {
if (list_empty(&(rec->r_list[type])))
continue;
- mlog(0, "Recovering quota in slot %d\n", slot_num);
+ trace_ocfs2_finish_quota_recovery(slot_num);
lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
if (!lqinode) {
status = -ENOENT;
@@ -577,8 +618,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
/* Someone else is holding the lock? Then he must be
* doing the recovery. Just skip the file... */
if (status == -EAGAIN) {
- mlog(ML_NOTICE, "skipping quota recovery for slot %d "
- "because quota file is locked.\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Skipping quota recovery on "
+ "device (%s) for slot %d because quota file is "
+ "locked.\n", osb->dev_str, slot_num);
status = 0;
goto out_put;
} else if (status < 0) {
@@ -608,13 +650,15 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
goto out_bh;
/* Mark quota file as clean if we are recovering quota file of
* some other node. */
- handle = ocfs2_start_trans(osb, 1);
+ handle = ocfs2_start_trans(osb,
+ OCFS2_LOCAL_QINFO_WRITE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out_bh;
}
- status = ocfs2_journal_access_dq(handle, lqinode, bh,
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
+ bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -623,9 +667,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
lock_buffer(bh);
ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
unlock_buffer(bh);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, bh);
out_trans:
ocfs2_commit_trans(osb, handle);
out_bh:
@@ -655,6 +697,9 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
struct ocfs2_quota_recovery *rec;
int locked = 0;
+ /* We don't need the lock and we have to acquire quota file locks
+ * which will later depend on this lock */
+ mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
info->dqi_maxblimit = 0x7fffffffffffffffLL;
info->dqi_maxilimit = 0x7fffffffffffffffLL;
oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
@@ -668,7 +713,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
INIT_LIST_HEAD(&oinfo->dqi_chunk);
oinfo->dqi_rec = NULL;
oinfo->dqi_lqi_bh = NULL;
- oinfo->dqi_ibh = NULL;
+ oinfo->dqi_libh = NULL;
status = ocfs2_global_read_info(sb, type);
if (status < 0)
@@ -694,7 +739,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
- oinfo->dqi_ibh = bh;
+ oinfo->dqi_libh = bh;
/* We crashed when using local quota file? */
if (!(info->dqi_flags & OLQF_CLEAN)) {
@@ -733,6 +778,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
goto out_err;
}
+ mutex_lock(&sb_dqopt(sb)->dqio_mutex);
return 0;
out_err:
if (oinfo) {
@@ -746,6 +792,7 @@ out_err:
kfree(oinfo);
}
brelse(bh);
+ mutex_lock(&sb_dqopt(sb)->dqio_mutex);
return -1;
}
@@ -754,7 +801,7 @@ static int ocfs2_local_write_info(struct super_block *sb, int type)
{
struct mem_dqinfo *info = sb_dqinfo(sb, type);
struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
- ->dqi_ibh;
+ ->dqi_libh;
int status;
status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
@@ -777,10 +824,6 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
int mark_clean = 1, len;
int status;
- /* At this point we know there are no more dquots and thus
- * even if there's some sync in the pdflush queue, it won't
- * find any dquots and return without doing anything */
- cancel_delayed_work_sync(&oinfo->dqi_sync_work);
iput(oinfo->dqi_gqinode);
ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
ocfs2_lock_res_free(&oinfo->dqi_gqlock);
@@ -815,7 +858,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
/* Mark local file as clean */
info->dqi_flags |= OLQF_CLEAN;
status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
- oinfo->dqi_ibh,
+ oinfo->dqi_libh,
olq_update_info,
info);
if (status < 0) {
@@ -825,7 +868,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
out:
ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
- brelse(oinfo->dqi_ibh);
+ brelse(oinfo->dqi_libh);
brelse(oinfo->dqi_lqi_bh);
kfree(oinfo);
return 0;
@@ -840,35 +883,36 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data
+ ol_dqblk_block_offset(sb, od->dq_local_off));
- dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id);
+ dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns,
+ od->dq_dquot.dq_id));
spin_lock(&dq_data_lock);
dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
od->dq_origspace);
dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
od->dq_originodes);
spin_unlock(&dq_data_lock);
- mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
- od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod),
- (long long)le64_to_cpu(dqblk->dqb_inodemod));
+ trace_olq_set_dquot(
+ (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod),
+ (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod),
+ from_kqid(&init_user_ns, od->dq_dquot.dq_id));
}
/* Write dquot to local quota file */
-static int ocfs2_local_write_dquot(struct dquot *dquot)
+int ocfs2_local_write_dquot(struct dquot *dquot)
{
struct super_block *sb = dquot->dq_sb;
struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
+ struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_id.type];
int status;
- status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type],
- ol_dqblk_file_block(sb, od->dq_local_off),
- &bh);
+ status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk,
+ &bh);
if (status) {
mlog_errno(status);
goto out;
}
- status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh,
- olq_set_dquot, od);
+ status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -908,7 +952,7 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
* ol_quota_entries_per_block(sb);
}
- found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
+ found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0);
/* We failed? */
if (found == len) {
mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
@@ -933,19 +977,19 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
struct ocfs2_local_disk_chunk *dchunk;
int status;
handle_t *handle;
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh = NULL, *dbh = NULL;
u64 p_blkno;
/* We are protected by dqio_sem so no locking needed */
- status = ocfs2_extend_no_holes(lqinode,
- lqinode->i_size + 2 * sb->s_blocksize,
- lqinode->i_size);
+ status = ocfs2_extend_no_holes(lqinode, NULL,
+ i_size_read(lqinode) + 2 * sb->s_blocksize,
+ i_size_read(lqinode));
if (status < 0) {
mlog_errno(status);
goto out;
}
status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
- lqinode->i_size + 2 * sb->s_blocksize);
+ i_size_read(lqinode) + 2 * sb->s_blocksize);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -957,32 +1001,33 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
mlog_errno(status);
goto out;
}
+ /* Local quota info and two new blocks we initialize */
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+ 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out;
+ }
- down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+ /* Initialize chunk header */
status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
&p_blkno, NULL, NULL);
- up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
if (status < 0) {
mlog_errno(status);
- goto out;
+ goto out_trans;
}
bh = sb_getblk(sb, p_blkno);
if (!bh) {
status = -ENOMEM;
mlog_errno(status);
- goto out;
+ goto out_trans;
}
dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
-
- handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- mlog_errno(status);
- goto out;
- }
-
- status = ocfs2_journal_access_dq(handle, lqinode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto out_trans;
@@ -992,14 +1037,35 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
memset(dchunk->dqc_bitmap, 0,
sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
OCFS2_QBLK_RESERVED_SPACE);
- set_buffer_uptodate(bh);
unlock_buffer(bh);
- status = ocfs2_journal_dirty(handle, bh);
+ ocfs2_journal_dirty(handle, bh);
+
+ /* Initialize new block with structures */
+ status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
+ &p_blkno, NULL, NULL);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ dbh = sb_getblk(sb, p_blkno);
+ if (!dbh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_trans;
+ }
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), dbh);
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), dbh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto out_trans;
}
+ lock_buffer(dbh);
+ memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
+ unlock_buffer(dbh);
+ ocfs2_journal_dirty(handle, dbh);
+ /* Update local quotafile info */
oinfo->dqi_blocks += 2;
oinfo->dqi_chunks++;
status = ocfs2_local_write_info(sb, type);
@@ -1024,6 +1090,7 @@ out_trans:
ocfs2_commit_trans(OCFS2_SB(sb), handle);
out:
brelse(bh);
+ brelse(dbh);
kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
return ERR_PTR(status);
}
@@ -1041,6 +1108,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
struct ocfs2_local_disk_chunk *dchunk;
int epb = ol_quota_entries_per_block(sb);
unsigned int chunk_blocks;
+ struct buffer_head *bh;
+ u64 p_blkno;
int status;
handle_t *handle;
@@ -1055,26 +1124,59 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
return ocfs2_local_quota_add_chunk(sb, type, offset);
/* We are protected by dqio_sem so no locking needed */
- status = ocfs2_extend_no_holes(lqinode,
- lqinode->i_size + sb->s_blocksize,
- lqinode->i_size);
+ status = ocfs2_extend_no_holes(lqinode, NULL,
+ i_size_read(lqinode) + sb->s_blocksize,
+ i_size_read(lqinode));
if (status < 0) {
mlog_errno(status);
goto out;
}
status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
- lqinode->i_size + sb->s_blocksize);
+ i_size_read(lqinode) + sb->s_blocksize);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ /* Get buffer from the just added block */
+ status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+ &p_blkno, NULL, NULL);
if (status < 0) {
mlog_errno(status);
goto out;
}
- handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+ bh = sb_getblk(sb, p_blkno);
+ if (!bh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out;
+ }
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
+
+ /* Local quota info, chunk header and the new block we initialize */
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+ 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out;
}
- status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
+ /* Zero created block */
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ lock_buffer(bh);
+ memset(bh->b_data, 0, sb->s_blocksize);
+ unlock_buffer(bh);
+ ocfs2_journal_dirty(handle, bh);
+
+ /* Update chunk header */
+ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
+ chunk->qc_headerbh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1085,11 +1187,9 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
lock_buffer(chunk->qc_headerbh);
le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
unlock_buffer(chunk->qc_headerbh);
- status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
- if (status < 0) {
- mlog_errno(status);
- goto out_trans;
- }
+ ocfs2_journal_dirty(handle, chunk->qc_headerbh);
+
+ /* Update file header */
oinfo->dqi_blocks++;
status = ocfs2_local_write_info(sb, type);
if (status < 0) {
@@ -1116,31 +1216,41 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
struct ocfs2_local_disk_chunk *dchunk;
dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
- ocfs2_set_bit(*offset, dchunk->dqc_bitmap);
+ ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, -1);
}
/* Create dquot in the local file for given id */
-static int ocfs2_create_local_dquot(struct dquot *dquot)
+int ocfs2_create_local_dquot(struct dquot *dquot)
{
struct super_block *sb = dquot->dq_sb;
- int type = dquot->dq_type;
+ int type = dquot->dq_id.type;
struct inode *lqinode = sb_dqopt(sb)->files[type];
struct ocfs2_quota_chunk *chunk;
struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
int offset;
int status;
+ u64 pcount;
+ down_write(&OCFS2_I(lqinode)->ip_alloc_sem);
chunk = ocfs2_find_free_entry(sb, type, &offset);
if (!chunk) {
chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
- if (IS_ERR(chunk))
- return PTR_ERR(chunk);
+ if (IS_ERR(chunk)) {
+ status = PTR_ERR(chunk);
+ goto out;
+ }
} else if (IS_ERR(chunk)) {
- return PTR_ERR(chunk);
+ status = PTR_ERR(chunk);
+ goto out;
}
od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
od->dq_chunk = chunk;
+ status = ocfs2_extent_map_get_blocks(lqinode,
+ ol_dqblk_block(sb, chunk->qc_num, offset),
+ &od->dq_local_phys_blk,
+ &pcount,
+ NULL);
/* Initialize dquot structure on disk */
status = ocfs2_local_write_dquot(dquot);
@@ -1157,57 +1267,25 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
goto out;
}
out:
+ up_write(&OCFS2_I(lqinode)->ip_alloc_sem);
return status;
}
-/* Create entry in local file for dquot, load data from the global file */
-static int ocfs2_local_read_dquot(struct dquot *dquot)
-{
- int status;
-
- mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
-
- status = ocfs2_global_read_dquot(dquot);
- if (status < 0) {
- mlog_errno(status);
- goto out_err;
- }
-
- /* Now create entry in the local quota file */
- status = ocfs2_create_local_dquot(dquot);
- if (status < 0) {
- mlog_errno(status);
- goto out_err;
- }
- mlog_exit(0);
- return 0;
-out_err:
- mlog_exit(status);
- return status;
-}
-
-/* Release dquot structure from local quota file. ocfs2_release_dquot() has
- * already started a transaction and obtained exclusive lock for global
- * quota file. */
-static int ocfs2_local_release_dquot(struct dquot *dquot)
+/*
+ * Release dquot structure from local quota file. ocfs2_release_dquot() has
+ * already started a transaction and written all changes to global quota file
+ */
+int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
{
int status;
- int type = dquot->dq_type;
+ int type = dquot->dq_id.type;
struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
struct super_block *sb = dquot->dq_sb;
struct ocfs2_local_disk_chunk *dchunk;
int offset;
- handle_t *handle = journal_current_handle();
- BUG_ON(!handle);
- /* First write all local changes to global file */
- status = ocfs2_global_release_dquot(dquot);
- if (status < 0) {
- mlog_errno(status);
- goto out;
- }
-
- status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type],
+ status = ocfs2_journal_access_dq(handle,
+ INODE_CACHE(sb_dqopt(sb)->files[type]),
od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -1219,31 +1297,20 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
(od->dq_chunk->qc_headerbh->b_data);
/* Mark structure as freed */
lock_buffer(od->dq_chunk->qc_headerbh);
- ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
+ ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, 1);
unlock_buffer(od->dq_chunk->qc_headerbh);
- status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
- if (status < 0) {
- mlog_errno(status);
- goto out;
- }
- status = 0;
+ ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
+
out:
- /* Clear the read bit so that next time someone uses this
- * dquot he reads fresh info from disk and allocates local
- * dquot structure */
- clear_bit(DQ_READ_B, &dquot->dq_flags);
return status;
}
-static struct quota_format_ops ocfs2_format_ops = {
+static const struct quota_format_ops ocfs2_format_ops = {
.check_quota_file = ocfs2_local_check_quota_file,
.read_file_info = ocfs2_local_read_info,
.write_file_info = ocfs2_global_write_info,
.free_file_info = ocfs2_local_free_info,
- .read_dqblk = ocfs2_local_read_dquot,
- .commit_dqblk = ocfs2_local_write_dquot,
- .release_dqblk = ocfs2_local_release_dquot,
};
struct quota_format_type ocfs2_quota_format = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
new file mode 100644
index 00000000000..636aab69ead
--- /dev/null
+++ b/fs/ocfs2/refcounttree.c
@@ -0,0 +1,4476 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * refcounttree.c
+ *
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/sort.h>
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "suballoc.h"
+#include "journal.h"
+#include "uptodate.h"
+#include "super.h"
+#include "buffer_head_io.h"
+#include "blockcheck.h"
+#include "refcounttree.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "aops.h"
+#include "xattr.h"
+#include "namei.h"
+#include "ocfs2_trace.h"
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+#include <linux/security.h>
+#include <linux/fsnotify.h>
+#include <linux/quotaops.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/posix_acl.h>
+
+/*
+ * State bundle carried through a copy-on-write (CoW) operation.
+ *
+ * NOTE(review): the code that fills in and consumes this structure is
+ * outside this part of the file; the field notes below are inferred from
+ * names and types only and should be confirmed against the users.
+ */
+struct ocfs2_cow_context {
+ struct inode *inode;
+ u32 cow_start; /* presumably start of the range to CoW - confirm units */
+ u32 cow_len; /* presumably length of the range to CoW - confirm units */
+ struct ocfs2_extent_tree data_et;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct buffer_head *ref_root_bh;
+ struct ocfs2_alloc_context *meta_ac;
+ struct ocfs2_alloc_context *data_ac;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ void *cow_object;
+ struct ocfs2_post_refcount *post_refcount;
+ int extra_credits;
+ /* Callback: resolve v_cluster to a physical cluster, count and flags. */
+ int (*get_clusters)(struct ocfs2_cow_context *context,
+ u32 v_cluster, u32 *p_cluster,
+ u32 *num_clusters,
+ unsigned int *extent_flags);
+ /* Callback: copy new_len clusters from old_cluster to new_cluster. */
+ int (*cow_duplicate_clusters)(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len);
+};
+
+/* Map an embedded rf_ci caching-info back to the refcount tree holding it. */
+static inline struct ocfs2_refcount_tree *
+cache_info_to_refcount(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *owner;
+
+	owner = container_of(ci, struct ocfs2_refcount_tree, rf_ci);
+
+	return owner;
+}
+
+/*
+ * Validation callback for refcount blocks; ocfs2_read_refcount_block()
+ * hands it to ocfs2_read_block().
+ *
+ * Checks, in order: the metadata ECC, the block signature, that rf_blkno
+ * matches the buffer's block number, and that rf_fs_generation matches the
+ * mounted filesystem.  Returns 0 if everything matches, the ECC error if the
+ * checksum fails, or -EINVAL (after ocfs2_error()) for any other mismatch.
+ */
+static int ocfs2_validate_refcount_block(struct super_block *sb,
+					 struct buffer_head *bh)
+{
+	struct ocfs2_refcount_block *rb;
+	int status;
+
+	rb = (struct ocfs2_refcount_block *)bh->b_data;
+
+	trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * A checksum failure is reported to the caller, but the filesystem
+	 * is left running: the damage is known to be local to this block.
+	 */
+	status = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
+	if (status) {
+		mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return status;
+	}
+
+	/* Every remaining failure is reported as -EINVAL. */
+	status = -EINVAL;
+
+	if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb))
+		ocfs2_error(sb,
+			    "Refcount block #%llu has bad signature %.*s",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    rb->rf_signature);
+	else if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr)
+		ocfs2_error(sb,
+			    "Refcount block #%llu has an invalid rf_blkno "
+			    "of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(rb->rf_blkno));
+	else if (le32_to_cpu(rb->rf_fs_generation) !=
+		 OCFS2_SB(sb)->fs_generation)
+		ocfs2_error(sb,
+			    "Refcount block #%llu has an invalid "
+			    "rf_fs_generation of #%u",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(rb->rf_fs_generation));
+	else
+		status = 0;
+
+	return status;
+}
+
+/*
+ * Read and validate the refcount block at rb_blkno through cache ci.
+ *
+ * If *bh is NULL on entry and the read succeeds, *bh is set to the buffer
+ * ocfs2_read_block() produced; a non-NULL *bh is left untouched.
+ * Returns the status of ocfs2_read_block().
+ */
+static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
+				     u64 rb_blkno,
+				     struct buffer_head **bh)
+{
+	struct buffer_head *found = *bh;
+	int status;
+
+	status = ocfs2_read_block(ci, rb_blkno, &found,
+				  ocfs2_validate_refcount_block);
+	if (status)
+		return status;
+
+	/* Hand a freshly obtained bh up to the caller. */
+	if (!*bh)
+		*bh = found;
+
+	return 0;
+}
+
+/* co_owner callback: the cache owner id is the tree's rf_blkno. */
+static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
+{
+	return cache_info_to_refcount(ci)->rf_blkno;
+}
+
+/* co_get_super callback: return the super block stored in the tree. */
+static struct super_block *
+ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
+{
+	return cache_info_to_refcount(ci)->rf_sb;
+}
+
+/* co_cache_lock callback: take the tree's rf_lock spinlock. */
+static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
+{
+	spin_lock(&cache_info_to_refcount(ci)->rf_lock);
+}
+
+/* co_cache_unlock callback: release the tree's rf_lock spinlock. */
+static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
+{
+	spin_unlock(&cache_info_to_refcount(ci)->rf_lock);
+}
+
+/* co_io_lock callback: take the tree's rf_io_mutex. */
+static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+	mutex_lock(&cache_info_to_refcount(ci)->rf_io_mutex);
+}
+
+/* co_io_unlock callback: release the tree's rf_io_mutex. */
+static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+	mutex_unlock(&cache_info_to_refcount(ci)->rf_io_mutex);
+}
+
+/* Metadata-cache callbacks for caches embedded in a refcount tree (rf_ci). */
+static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
+	.co_owner		= ocfs2_refcount_cache_owner,
+	.co_get_super		= ocfs2_refcount_cache_get_super,
+	.co_cache_lock		= ocfs2_refcount_cache_lock,
+	.co_cache_unlock	= ocfs2_refcount_cache_unlock,
+	.co_io_lock		= ocfs2_refcount_cache_io_lock,
+	.co_io_unlock		= ocfs2_refcount_cache_io_unlock,
+};
+
+/*
+ * Search osb->osb_rf_lock_tree (an rb-tree keyed by rf_blkno) for the
+ * refcount tree rooted at blkno.  Returns the tree or NULL if none is
+ * registered.  No locking is done here.
+ */
+static struct ocfs2_refcount_tree *
+ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
+{
+	struct rb_node *cur = osb->osb_rf_lock_tree.rb_node;
+
+	while (cur) {
+		struct ocfs2_refcount_tree *entry =
+			rb_entry(cur, struct ocfs2_refcount_tree, rf_node);
+
+		if (blkno == entry->rf_blkno)
+			return entry;
+
+		cur = (blkno < entry->rf_blkno) ? cur->rb_left : cur->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Link a new refcount tree into osb->osb_rf_lock_tree, keyed by rf_blkno.
+ * osb_lock is already held by the caller.  A tree with the same rf_blkno
+ * must not already be present; finding one triggers BUG().
+ */
+static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
+				       struct ocfs2_refcount_tree *new)
+{
+	struct rb_node **link = &osb->osb_rf_lock_tree.rb_node;
+	struct rb_node *parent = NULL;
+	u64 key = new->rf_blkno;
+
+	while (*link) {
+		struct ocfs2_refcount_tree *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct ocfs2_refcount_tree, rf_node);
+
+		if (key == cur->rf_blkno) {
+			/* This should never happen! */
+			mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
+			     (unsigned long long)key);
+			BUG();
+		} else if (key < cur->rf_blkno) {
+			link = &(*link)->rb_left;
+		} else {
+			link = &(*link)->rb_right;
+		}
+	}
+
+	rb_link_node(&new->rf_node, parent, link);
+	rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
+}
+
+/*
+ * Tear down a refcount tree object: shut down its metadata cache, drop and
+ * free its cluster lock resource, then free the memory.  The tree is not
+ * unlinked from the rb-tree here.
+ */
+static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
+{
+	struct ocfs2_lock_res *lockres = &tree->rf_lockres;
+
+	ocfs2_metadata_cache_exit(&tree->rf_ci);
+
+	ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), lockres);
+	ocfs2_lock_res_free(lockres);
+
+	kfree(tree);
+}
+
+/*
+ * Unlink tree from osb->osb_rf_lock_tree and, if it is the cached
+ * osb_ref_tree_lru entry, clear that cache.  Takes no lock itself.
+ */
+static inline void
+ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
+					    struct ocfs2_refcount_tree *tree)
+{
+	rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
+
+	if (!osb->osb_ref_tree_lru)
+		return;
+
+	if (osb->osb_ref_tree_lru == tree)
+		osb->osb_ref_tree_lru = NULL;
+}
+
+/* Locked variant: unlink tree from the osb's rb-tree under osb_lock. */
+static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
+						struct ocfs2_refcount_tree *tree)
+{
+	spinlock_t *lock = &osb->osb_lock;
+
+	spin_lock(lock);
+	ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
+	spin_unlock(lock);
+}
+
+/* kref release callback: free the tree that embeds this rf_getcnt. */
+static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
+{
+	ocfs2_free_refcount_tree(container_of(kref,
+					      struct ocfs2_refcount_tree,
+					      rf_getcnt));
+}
+
+/* Take one reference on the tree's rf_getcnt kref. */
+static inline void
+ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
+{
+	struct kref *ref = &tree->rf_getcnt;
+
+	kref_get(ref);
+}
+
+/*
+ * Drop one reference on the tree; the final put releases it through
+ * ocfs2_kref_remove_refcount_tree().
+ */
+static inline void
+ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
+{
+	struct kref *ref = &tree->rf_getcnt;
+
+	kref_put(ref, ocfs2_kref_remove_refcount_tree);
+}
+
+/*
+ * Set up the caching side of a new refcount tree: the metadata cache (wired
+ * to ocfs2_refcount_caching_ops), the I/O mutex, the super block
+ * back-pointer and the cache spinlock.
+ */
+static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
+					       struct super_block *sb)
+{
+	struct ocfs2_caching_info *ci = &new->rf_ci;
+
+	ocfs2_metadata_cache_init(ci, &ocfs2_refcount_caching_ops);
+	mutex_init(&new->rf_io_mutex);
+	new->rf_sb = sb;
+	spin_lock_init(&new->rf_lock);
+}
+
+/*
+ * Set up the locking side of a new refcount tree: the local rf_sem rwsem and
+ * the lock resource, which is initialised from rf_blkno and generation.
+ */
+static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
+					struct ocfs2_refcount_tree *new,
+					u64 rf_blkno, u32 generation)
+{
+	struct ocfs2_lock_res *lockres = &new->rf_lockres;
+
+	init_rwsem(&new->rf_sem);
+	ocfs2_refcount_lock_res_init(lockres, osb, rf_blkno, generation);
+}
+
+/*
+ * Allocate a zeroed refcount tree object for block rf_blkno, with its kref
+ * initialised and its caching info set up.  The lock resource is NOT
+ * initialised here.  Returns NULL on allocation failure.
+ */
+static struct ocfs2_refcount_tree*
+ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
+{
+	struct ocfs2_refcount_tree *tree;
+
+	tree = kzalloc(sizeof(*tree), GFP_NOFS);
+	if (tree) {
+		tree->rf_blkno = rf_blkno;
+		kref_init(&tree->rf_getcnt);
+		ocfs2_init_refcount_tree_ci(tree, osb->sb);
+	}
+
+	return tree;
+}
+
+/*
+ * Find the in-memory refcount tree for block rf_blkno, creating and
+ * registering one if none exists yet.
+ *
+ * The one-entry osb_ref_tree_lru cache is checked first, then the rb-tree.
+ * On success *ret_tree is set and osb_ref_tree_lru is updated to point at
+ * the returned tree.  No extra kref is taken on the returned tree here.
+ *
+ * Returns 0 on success, -ENOMEM if the tree cannot be allocated, or the
+ * error from reading the refcount root block.
+ */
+static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
+ struct ocfs2_refcount_tree **ret_tree)
+{
+ int ret = 0;
+ struct ocfs2_refcount_tree *tree, *new = NULL;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_block *ref_rb;
+
+ spin_lock(&osb->osb_lock);
+ if (osb->osb_ref_tree_lru &&
+ osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
+ tree = osb->osb_ref_tree_lru;
+ else
+ tree = ocfs2_find_refcount_tree(osb, rf_blkno);
+ if (tree)
+ goto out;
+
+ /* Not registered yet: drop the spinlock before allocating and reading. */
+ spin_unlock(&osb->osb_lock);
+
+ new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
+ if (!new) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ return ret;
+ }
+ /*
+ * We need the generation to create the refcount tree lock and since
+ * it isn't changed during the tree modification, we are safe here to
+ * read without protection.
+ * We also have to purge the cache after we create the lock since the
+ * refcount block may have the stale data. It can only be trusted when
+ * we hold the refcount lock.
+ */
+ ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ /* The lock resource was never initialised, so undo by hand. */
+ ocfs2_metadata_cache_exit(&new->rf_ci);
+ kfree(new);
+ return ret;
+ }
+
+ ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
+ ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
+ new->rf_generation);
+ ocfs2_metadata_cache_purge(&new->rf_ci);
+
+ /* The lock was dropped above, so look the tree up again before inserting. */
+ spin_lock(&osb->osb_lock);
+ tree = ocfs2_find_refcount_tree(osb, rf_blkno);
+ if (tree)
+ goto out;
+
+ ocfs2_insert_refcount_tree(osb, new);
+
+ tree = new;
+ new = NULL;
+
+out:
+ *ret_tree = tree;
+
+ osb->osb_ref_tree_lru = tree;
+
+ spin_unlock(&osb->osb_lock);
+
+ /* Still non-NULL only when the second lookup found an existing tree. */
+ if (new)
+ ocfs2_free_refcount_tree(new);
+
+ brelse(ref_root_bh);
+ return ret;
+}
+
+/*
+ * Fetch the refcount tree root block number (i_refcount_loc) from the
+ * inode's on-disk dinode into *ref_blkno.  The inode must already have
+ * OCFS2_HAS_REFCOUNT_FL set (BUG otherwise).
+ * Returns 0 on success or the error from reading the inode block.
+ */
+static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
+{
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	int ret;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	*ref_blkno = le64_to_cpu(di->i_refcount_loc);
+	brelse(di_bh);
+
+	return ret;
+}
+
+/*
+ * Take the refcount lock via ocfs2_refcount_lock() and then the local
+ * rf_sem: for write when rw is non-zero, for read otherwise.
+ * Returns 0 on success, or the ocfs2_refcount_lock() error, in which case
+ * rf_sem is not taken.
+ */
+static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
+				      struct ocfs2_refcount_tree *tree, int rw)
+{
+	int status = ocfs2_refcount_lock(tree, rw);
+
+	if (status) {
+		mlog_errno(status);
+		return status;
+	}
+
+	if (!rw)
+		down_read(&tree->rf_sem);
+	else
+		down_write(&tree->rf_sem);
+
+	return 0;
+}
+
+/*
+ * Lock the refcount tree pointed by ref_blkno and return the tree.
+ * In most case, we lock the tree and read the refcount block.
+ * So read it here if the caller really needs it.
+ *
+ * If the tree has been re-created by other node, it will free the
+ * old one and re-create it.
+ *
+ * @rw:       non-zero for an exclusive (write) lock, zero for a read lock.
+ * @ret_tree: on success, the locked tree; it carries one kref that
+ *            ocfs2_unlock_refcount_tree() drops.
+ * @ref_bh:   optional; on success receives the root refcount block buffer,
+ *            which the caller must brelse().
+ *
+ * Returns 0 on success or a negative error; on error nothing is left
+ * locked and no reference is held.
+ */
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
+			     u64 ref_blkno, int rw,
+			     struct ocfs2_refcount_tree **ret_tree,
+			     struct buffer_head **ref_bh)
+{
+	int ret, delete_tree;
+	struct ocfs2_refcount_tree *tree = NULL;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_block *rb;
+
+again:
+	/*
+	 * Reset on every pass so that a decision made for a previous,
+	 * already released tree cannot trigger an extra put on this one.
+	 */
+	delete_tree = 0;
+
+	ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ocfs2_refcount_tree_get(tree);
+
+	ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
+	if (ret) {
+		mlog_errno(ret);
+		ocfs2_refcount_tree_put(tree);
+		goto out;
+	}
+
+	ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
+					&ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		/*
+		 * ocfs2_unlock_refcount_tree() already drops the reference
+		 * taken above.  A second put here would release the
+		 * creation reference and free a tree that is still linked
+		 * in the rb-tree.
+		 */
+		ocfs2_unlock_refcount_tree(osb, tree, rw);
+		goto out;
+	}
+
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	/*
+	 * If the refcount block has been freed and re-created, we may need
+	 * to recreate the refcount tree also.
+	 *
+	 * Here we just remove the tree from the rb-tree, and the last
+	 * kref holder will unlock and delete this refcount_tree.
+	 * Then we goto "again" and ocfs2_get_refcount_tree will create
+	 * the new refcount tree for us.
+	 */
+	if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
+		if (!tree->rf_removed) {
+			ocfs2_erase_refcount_tree_from_list(osb, tree);
+			tree->rf_removed = 1;
+			delete_tree = 1;
+		}
+
+		ocfs2_unlock_refcount_tree(osb, tree, rw);
+		/*
+		 * We get an extra reference when we create the refcount
+		 * tree, so another put will destroy it.
+		 */
+		if (delete_tree)
+			ocfs2_refcount_tree_put(tree);
+		brelse(ref_root_bh);
+		ref_root_bh = NULL;
+		goto again;
+	}
+
+	*ret_tree = tree;
+	if (ref_bh) {
+		/* Ownership of the buffer moves to the caller. */
+		*ref_bh = ref_root_bh;
+		ref_root_bh = NULL;
+	}
+out:
+	brelse(ref_root_bh);
+	return ret;
+}
+
+/*
+ * Undo ocfs2_lock_refcount_tree(): release rf_sem (write side when rw is
+ * non-zero, read side otherwise), release the refcount lock, and drop one
+ * kref on the tree.
+ */
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+				struct ocfs2_refcount_tree *tree, int rw)
+{
+	if (!rw)
+		up_read(&tree->rf_sem);
+	else
+		up_write(&tree->rf_sem);
+
+	ocfs2_refcount_unlock(tree, rw);
+
+	ocfs2_refcount_tree_put(tree);
+}
+
+/*
+ * Empty osb->osb_rf_lock_tree: repeatedly take the last node, unlink it and
+ * free the tree directly (kref counts are not consulted).  No lock is taken
+ * here.
+ */
+void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
+{
+	struct rb_root *root = &osb->osb_rf_lock_tree;
+	struct rb_node *last;
+
+	for (last = rb_last(root); last != NULL; last = rb_last(root)) {
+		struct ocfs2_refcount_tree *victim =
+			rb_entry(last, struct ocfs2_refcount_tree, rf_node);
+
+		trace_ocfs2_purge_refcount_trees(
+				(unsigned long long) victim->rf_blkno);
+
+		rb_erase(&victim->rf_node, root);
+		ocfs2_free_refcount_tree(victim);
+	}
+}
+
+/*
+ * Create a refcount tree for an inode.
+ * We take for granted that the inode is already locked.
+ *
+ * Allocates one metadata block, initialises it as an empty root
+ * ocfs2_refcount_block, points the dinode at it (i_refcount_loc plus
+ * OCFS2_HAS_REFCOUNT_FL) and registers the in-memory tree on the osb.
+ * The inode must not already have a refcount tree (BUG otherwise).
+ *
+ * @di_bh: buffer holding the inode's on-disk dinode; journaled and dirtied
+ *         here.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_create_refcount_tree(struct inode *inode,
+				      struct buffer_head *di_bh)
+{
+	int ret;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 suballoc_loc, first_blkno;
+
+	BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+
+	trace_ocfs2_create_refcount_tree(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+				   &suballoc_bit_start, &num_got,
+				   &first_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
+	if (!new_tree) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	new_bh = sb_getblk(inode->i_sb, first_blkno);
+	if (!new_bh) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+	ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
+
+	ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/* Initialize ocfs2_refcount_block. */
+	rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+	memset(rb, 0, inode->i_sb->s_blocksize);
+	strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+	rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+	rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
+	rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+	rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
+	rb->rf_blkno = cpu_to_le64(first_blkno);
+	rb->rf_count = cpu_to_le32(1);
+	rb->rf_records.rl_count =
+			cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
+	spin_lock(&osb->osb_lock);
+	/*
+	 * rf_generation is an on-disk field and every reader decodes it with
+	 * le32_to_cpu(), so it has to be stored little-endian like the
+	 * other fields above.
+	 */
+	rb->rf_generation = cpu_to_le32(osb->s_next_generation++);
+	spin_unlock(&osb->osb_lock);
+
+	ocfs2_journal_dirty(handle, new_bh);
+
+	/* Point the inode at its new refcount root, in memory and on disk. */
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	di->i_refcount_loc = cpu_to_le64(first_blkno);
+	spin_unlock(&oi->ip_lock);
+
+	trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+	/*
+	 * We have to init the tree lock here since it will use
+	 * the generation number to create it.
+	 */
+	new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
+	ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
+				      new_tree->rf_generation);
+
+	spin_lock(&osb->osb_lock);
+	tree = ocfs2_find_refcount_tree(osb, first_blkno);
+
+	/*
+	 * We've just created a new refcount tree in this block.  If
+	 * we found a refcount tree on the ocfs2_super, it must be
+	 * one we just deleted.  We free the old tree before
+	 * inserting the new tree.
+	 */
+	BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
+	if (tree)
+		ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
+	ocfs2_insert_refcount_tree(osb, new_tree);
+	spin_unlock(&osb->osb_lock);
+	/* The rb-tree now owns new_tree; keep the cleanup below off it. */
+	new_tree = NULL;
+	if (tree)
+		ocfs2_refcount_tree_put(tree);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	/*
+	 * Only reached with a non-NULL new_tree on failure, before its lock
+	 * resource was initialised, so tear it down by hand.
+	 */
+	if (new_tree) {
+		ocfs2_metadata_cache_exit(&new_tree->rf_ci);
+		kfree(new_tree);
+	}
+
+	brelse(new_bh);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+/*
+ * Attach the existing refcount tree rooted at block refcount_loc to inode.
+ *
+ * The tree is locked via ocfs2_lock_refcount_tree(), then in one transaction
+ * rf_count in the root refcount block is incremented and the dinode in di_bh
+ * gets OCFS2_HAS_REFCOUNT_FL and i_refcount_loc = refcount_loc.
+ * The inode must not already have OCFS2_HAS_REFCOUNT_FL set (BUG_ON).
+ *
+ * Returns 0 on success or the error returned by the failing helper.
+ */
+static int ocfs2_set_refcount_tree(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 refcount_loc)
+{
+ int ret;
+ handle_t *handle = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_block *rb;
+ struct ocfs2_refcount_tree *ref_tree;
+
+ BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+
+ /* On failure nothing is held yet, so return directly. */
+ ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+ &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /* One more inode now shares this tree. */
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ le32_add_cpu(&rb->rf_count, 1);
+
+ ocfs2_journal_dirty(handle, ref_root_bh);
+
+ /* Record the tree in both the in-memory and the on-disk inode. */
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ di->i_refcount_loc = cpu_to_le64(refcount_loc);
+ spin_unlock(&oi->ip_lock);
+ ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+
+ return ret;
+}
+
+/*
+ * Detach inode from its refcount tree.
+ *
+ * Returns 0 immediately when the inode has no OCFS2_HAS_REFCOUNT_FL.
+ * Otherwise the tree is locked and, in one transaction, the flag and
+ * i_refcount_loc are cleared in the dinode and rf_count in the root
+ * refcount block is decremented. When the count reaches zero the cached
+ * tree is erased from the list and the root block is returned to the
+ * extent allocator it came from (rf_suballoc_slot).
+ *
+ * Returns 0 on success or the error returned by the failing helper.
+ */
+int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
+{
+ int ret, delete_tree = 0;
+ handle_t *handle = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_refcount_block *rb;
+ struct inode *alloc_inode = NULL;
+ struct buffer_head *alloc_bh = NULL;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_refcount_tree *ref_tree;
+ int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
+ u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
+ u16 bit = 0;
+
+ /* Nothing to do for an inode without a refcount tree. */
+ if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+ return 0;
+
+ BUG_ON(!ref_blkno);
+ ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
+
+ /*
+ * If we are the last user, we need to free the block.
+ * So lock the allocator ahead.
+ */
+ if (le32_to_cpu(rb->rf_count) == 1) {
+ blk = le64_to_cpu(rb->rf_blkno);
+ bit = le16_to_cpu(rb->rf_suballoc_bit);
+ /* Use the recorded group location if set, else derive it. */
+ if (rb->rf_suballoc_loc)
+ bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
+ else
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+ alloc_inode = ocfs2_get_system_file_inode(osb,
+ EXTENT_ALLOC_SYSTEM_INODE,
+ le16_to_cpu(rb->rf_suballoc_slot));
+ if (!alloc_inode) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ mutex_lock(&alloc_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+
+ /* Freeing the block needs extra journal credits. */
+ credits += OCFS2_SUBALLOC_FREE;
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /* Clear the tree reference in the in-memory and on-disk inode. */
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ di->i_refcount_loc = 0;
+ spin_unlock(&oi->ip_lock);
+ ocfs2_journal_dirty(handle, di_bh);
+
+ /* Drop this inode's reference on the tree. */
+ le32_add_cpu(&rb->rf_count , -1);
+ ocfs2_journal_dirty(handle, blk_bh);
+
+ /* Last reference gone: forget the cached tree and free its root block. */
+ if (!rb->rf_count) {
+ delete_tree = 1;
+ ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
+ ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
+ alloc_bh, bit, bg_blkno, 1);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out_unlock:
+ /* alloc_inode is only non-NULL when the allocator was locked above. */
+ if (alloc_inode) {
+ ocfs2_inode_unlock(alloc_inode, 1);
+ brelse(alloc_bh);
+ }
+out_mutex:
+ if (alloc_inode) {
+ mutex_unlock(&alloc_inode->i_mutex);
+ iput(alloc_inode);
+ }
+out:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ /* Release the tree object once it has been erased from the list. */
+ if (delete_tree)
+ ocfs2_refcount_tree_put(ref_tree);
+ brelse(blk_bh);
+
+ return ret;
+}
+
+/*
+ * Look up cpos in the record list of one refcount block.
+ *
+ * If a record covers cpos, it is copied to *ret_rec (when ret_rec is not
+ * NULL) and *index is its position.  Otherwise cpos lies in a hole: a fake
+ * record with r_refcount == 0 starting at cpos is built in *ret_rec, its
+ * length clipped to len or to the start of the following record, and
+ * *index is the position where a record for cpos would be inserted.
+ */
+static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
+					  struct buffer_head *ref_leaf_bh,
+					  u64 cpos, unsigned int len,
+					  struct ocfs2_refcount_rec *ret_rec,
+					  int *index)
+{
+	struct ocfs2_refcount_block *rb =
+		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_list *rl = &rb->rf_records;
+	struct ocfs2_refcount_rec *following = NULL;
+	int used = le16_to_cpu(rl->rl_used);
+	int pos;
+
+	for (pos = 0; pos < used; pos++) {
+		struct ocfs2_refcount_rec *cur = &rl->rl_recs[pos];
+		u64 start = le64_to_cpu(cur->r_cpos);
+
+		/* Record ends at or before cpos: keep looking. */
+		if (start + le32_to_cpu(cur->r_clusters) <= cpos)
+			continue;
+
+		/* Record starts after cpos: cpos is in a hole before it. */
+		if (start > cpos) {
+			following = cur;
+			break;
+		}
+
+		/* cpos falls inside this record. */
+		if (ret_rec)
+			*ret_rec = *cur;
+		*index = pos;
+		return;
+	}
+
+	if (ret_rec) {
+		u32 hole_len = len;
+
+		/* Do not let the fake record run into the following one. */
+		if (following && le64_to_cpu(following->r_cpos) < cpos + len)
+			hole_len = le64_to_cpu(following->r_cpos) - cpos;
+
+		ret_rec->r_cpos = cpu_to_le64(cpos);
+		ret_rec->r_refcount = 0;
+		ret_rec->r_clusters = cpu_to_le32(hole_len);
+	}
+
+	*index = pos;
+}
+
+/*
+ * Best-effort removal of the inode's refcount tree.
+ *
+ * With ip_xattr_sem and ip_alloc_sem held for write, the tree is removed
+ * only when all of the following hold:
+ *  1) the inode has no clusters (ip_clusters == 0),
+ *  2) the dinode has no i_xattr_loc while OCFS2_HAS_XATTR_FL is set,
+ *  3) no inline xattr stores its value outside the inode.
+ *
+ * An error from the removal is logged but not propagated; this function
+ * always returns 0.
+ */
+int ocfs2_try_remove_refcount_tree(struct inode *inode,
+				   struct buffer_head *di_bh)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	down_write(&oi->ip_xattr_sem);
+	down_write(&oi->ip_alloc_sem);
+
+	if (!oi->ip_clusters &&
+	    !((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc) &&
+	    !((oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
+	      ocfs2_has_inline_xattr_value_outside(inode, di))) {
+		int status = ocfs2_remove_refcount_tree(inode, di_bh);
+
+		if (status)
+			mlog_errno(status);
+	}
+
+	up_write(&oi->ip_alloc_sem);
+	up_write(&oi->ip_xattr_sem);
+	return 0;
+}
+
+/*
+ * Find the end range for a leaf refcount block indicated by
+ * el->l_recs[index].e_blkno.
+ *
+ * On success *cpos_end is the e_cpos of the extent record that follows the
+ * one at index (looked up in el itself, or in the subtree root shared with
+ * the next leaf extent block), or UINT_MAX when no extent record follows.
+ * eb is the extent block containing el, or NULL when el is not in one.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_extent_block *eb,
+ struct ocfs2_extent_list *el,
+ int index, u32 *cpos_end)
+{
+ int ret, i, subtree_root;
+ u32 cpos;
+ u64 blkno;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct ocfs2_path *left_path = NULL, *right_path = NULL;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_extent_list *tmp_el;
+
+ if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
+ /*
+ * We have an extent rec after index, so just use the e_cpos
+ * of the next extent rec.
+ */
+ *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
+ return 0;
+ }
+
+ if (!eb || (eb && !eb->h_next_leaf_blk)) {
+ /*
+ * We are the last extent rec, so any high cpos should
+ * be stored in this leaf refcount block.
+ */
+ *cpos_end = UINT_MAX;
+ return 0;
+ }
+
+ /*
+ * If the extent block isn't the last one, we have to find
+ * the subtree root between this extent block and the next
+ * leaf extent block and get the corresponding e_cpos from
+ * the subroot. Otherwise we may corrupt the b-tree.
+ */
+ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+
+ left_path = ocfs2_new_path_from_et(&et);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* left_path: path from the root down to the leaf holding index. */
+ cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
+ ret = ocfs2_find_path(ci, left_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ right_path = ocfs2_new_path_from_path(left_path);
+ if (!right_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* right_path: path to the leaf to the right of left_path. */
+ ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(ci, right_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ subtree_root = ocfs2_find_subtree_root(&et, left_path,
+ right_path);
+
+ /*
+ * In the subtree root, locate the record pointing at left_path's
+ * child; the record after it supplies the end cpos.
+ */
+ tmp_el = left_path->p_node[subtree_root].el;
+ blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
+ for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) {
+ if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
+ *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
+ break;
+ }
+ }
+
+ /* The child of the subtree root must be referenced by it. */
+ BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec));
+
+out:
+ ocfs2_free_path(left_path);
+ ocfs2_free_path(right_path);
+ return ret;
+}
+
+/*
+ * Given a cpos and len, try to find the refcount record which contains cpos.
+ * 1. If cpos can be found in one refcount record, return the record.
+ * 2. If cpos can't be found, return a fake record which starts from cpos
+ *    and ends at the smaller of cpos+len and the start of the next record.
+ *    This fake record has r_refcount = 0.
+ *
+ * *index receives the position of the record (or of the hole) inside the
+ * refcount block returned in *ret_bh; that buffer head carries a reference
+ * the caller must drop with brelse().  ret_rec may be NULL when only the
+ * position is wanted.
+ *
+ * Returns 0 on success, -EROFS when the on-disk tree is found to be
+ * inconsistent, or another negative errno from the read helpers.
+ */
+static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
+				  struct buffer_head *ref_root_bh,
+				  u64 cpos, unsigned int len,
+				  struct ocfs2_refcount_rec *ret_rec,
+				  int *index,
+				  struct buffer_head **ret_bh)
+{
+	int ret = 0, i, found;
+	u32 low_cpos, uninitialized_var(cpos_end);
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec = NULL;
+	struct ocfs2_extent_block *eb = NULL;
+	struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+	/* Root still holds the records inline: search it directly. */
+	if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
+		ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
+					      ret_rec, index);
+		*ret_bh = ref_root_bh;
+		get_bh(ref_root_bh);
+		return 0;
+	}
+
+	el = &rb->rf_list;
+	/* The b-tree is keyed by the low 32 bits of the cluster offset. */
+	low_cpos = cpos & OCFS2_32BIT_POS_MASK;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(sb,
+			"refcount tree %llu has non zero tree "
+			"depth in leaf btree tree block %llu\n",
+			(unsigned long long)ocfs2_metadata_cache_owner(ci),
+			(unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	/* Walk backwards to the last extent rec starting at or below cpos. */
+	found = 0;
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
+			found = 1;
+			break;
+		}
+	}
+
+	/*
+	 * An extent list without any record leaves rec NULL.  That can only
+	 * come from a corrupted tree; report it instead of dereferencing
+	 * the NULL pointer below.
+	 */
+	if (!rec) {
+		ocfs2_error(sb,
+			    "refcount tree %llu has no extent record "
+			    "in btree block %llu\n",
+			    (unsigned long long)ocfs2_metadata_cache_owner(ci),
+			    (unsigned long long)(eb_bh ? eb_bh->b_blocknr :
+						 ref_root_bh->b_blocknr));
+		ret = -EROFS;
+		goto out;
+	}
+
+	if (found) {
+		ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
+						  eb, el, i, &cpos_end);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* Clip len so the lookup stays inside this leaf's range. */
+		if (cpos_end < low_cpos + len)
+			len = cpos_end - low_cpos;
+	}
+
+	/*
+	 * When nothing was found, rec is the first extent rec, so the
+	 * first leaf refcount block is searched.
+	 */
+	ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
+					&ref_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
+				      ret_rec, index);
+	*ret_bh = ref_leaf_bh;
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+/* How a refcount record is contiguous with its neighbours in a block. */
+enum ocfs2_ref_rec_contig {
+	REF_CONTIG_NONE		= 0,	/* mergeable with neither neighbour */
+	REF_CONTIG_LEFT		= 1,	/* mergeable with the previous record */
+	REF_CONTIG_RIGHT	= 2,	/* mergeable with the next record */
+	REF_CONTIG_LEFTRIGHT	= 3,	/* mergeable with both neighbours */
+};
+
+/*
+ * Report whether record index and record index + 1 can be merged: they
+ * must carry the same refcount and the first must end exactly where the
+ * second starts.  Returns REF_CONTIG_RIGHT if so, REF_CONTIG_NONE otherwise.
+ */
+static enum ocfs2_ref_rec_contig
+	ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
+				    int index)
+{
+	struct ocfs2_refcount_rec *cur = &rb->rf_records.rl_recs[index];
+	struct ocfs2_refcount_rec *nxt = &rb->rf_records.rl_recs[index + 1];
+
+	if (cur->r_refcount != nxt->r_refcount)
+		return REF_CONTIG_NONE;
+
+	if (le64_to_cpu(cur->r_cpos) + le32_to_cpu(cur->r_clusters) !=
+	    le64_to_cpu(nxt->r_cpos))
+		return REF_CONTIG_NONE;
+
+	return REF_CONTIG_RIGHT;
+}
+
+/*
+ * Classify record index by which of its neighbours (previous, next, both
+ * or none) it can be merged with.
+ */
+static enum ocfs2_ref_rec_contig
+	ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
+				  int index)
+{
+	int with_next = 0, with_prev = 0;
+
+	/* Only look right when a next record exists. */
+	if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
+		with_next = ocfs2_refcount_rec_adjacent(rb, index) ==
+							REF_CONTIG_RIGHT;
+
+	/* Only look left when a previous record exists. */
+	if (index > 0)
+		with_prev = ocfs2_refcount_rec_adjacent(rb, index - 1) ==
+							REF_CONTIG_RIGHT;
+
+	if (with_prev && with_next)
+		return REF_CONTIG_LEFTRIGHT;
+	if (with_prev)
+		return REF_CONTIG_LEFT;
+	if (with_next)
+		return REF_CONTIG_RIGHT;
+
+	return REF_CONTIG_NONE;
+}
+
+/*
+ * Fold record index + 1 into record index: the clusters of the second are
+ * added to the first, the following records are shifted down by one slot,
+ * the vacated last slot is zeroed and rl_used is decremented.
+ * Both records must have the same refcount (BUG_ON otherwise).
+ */
+static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
+					   int index)
+{
+	struct ocfs2_refcount_list *rl = &rb->rf_records;
+	struct ocfs2_refcount_rec *kept = &rl->rl_recs[index];
+	struct ocfs2_refcount_rec *absorbed = &rl->rl_recs[index + 1];
+	int used = le16_to_cpu(rl->rl_used);
+
+	BUG_ON(kept->r_refcount != absorbed->r_refcount);
+
+	le32_add_cpu(&kept->r_clusters, le32_to_cpu(absorbed->r_clusters));
+
+	/* Close the gap unless the absorbed record was the last one. */
+	if (index < used - 2)
+		memmove(absorbed, &rl->rl_recs[index + 2],
+			sizeof(struct ocfs2_refcount_rec) *
+			(used - index - 2));
+
+	memset(&rl->rl_recs[used - 1], 0, sizeof(struct ocfs2_refcount_rec));
+	le16_add_cpu(&rl->rl_used, -1);
+}
+
+/*
+ * Merge record index with whichever adjacent records it is contiguous
+ * with.  Does nothing when it is contiguous with neither neighbour.
+ */
+static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
+				     int index)
+{
+	switch (ocfs2_refcount_rec_contig(rb, index)) {
+	case REF_CONTIG_NONE:
+		return;
+	case REF_CONTIG_RIGHT:
+		ocfs2_rotate_refcount_rec_left(rb, index);
+		break;
+	case REF_CONTIG_LEFT:
+		BUG_ON(index == 0);
+		ocfs2_rotate_refcount_rec_left(rb, index - 1);
+		break;
+	case REF_CONTIG_LEFTRIGHT:
+		BUG_ON(index == 0);
+		/* First absorb index into index - 1, then the old right one. */
+		ocfs2_rotate_refcount_rec_left(rb, index - 1);
+		ocfs2_rotate_refcount_rec_left(rb, index - 1);
+		break;
+	}
+}
+
+/*
+ * Change the refcount indexed by "index" in ref_leaf_bh by "change".
+ * If refcount reaches 0, remove the record from the list.
+ * Otherwise, when "merge" is non-zero, try to merge the record with its
+ * contiguous neighbours.
+ *
+ * Returns 0 on success or the error from ocfs2_journal_access_rb().
+ */
+static int ocfs2_change_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_leaf_bh,
+ int index, int merge, int change)
+{
+ int ret;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_list *rl = &rb->rf_records;
+ struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_change_refcount_rec(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ index, le32_to_cpu(rec->r_refcount), change);
+ le32_add_cpu(&rec->r_refcount, change);
+
+ if (!rec->r_refcount) {
+ /*
+ * Shift the following records down over the dead one and clear
+ * the vacated last slot; nothing is moved when the dead record
+ * is already the last one.
+ */
+ if (index != le16_to_cpu(rl->rl_used) - 1) {
+ memmove(rec, rec + 1,
+ (le16_to_cpu(rl->rl_used) - index - 1) *
+ sizeof(struct ocfs2_refcount_rec));
+ memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
+ 0, sizeof(struct ocfs2_refcount_rec));
+ }
+
+ le16_add_cpu(&rl->rl_used, -1);
+ } else if (merge)
+ ocfs2_refcount_rec_merge(rb, index);
+
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
+out:
+ return ret;
+}
+
+/*
+ * Turn a root refcount block that stores its records inline into a
+ * one-leaf tree.
+ *
+ * A new block is claimed from meta_ac, the whole root block is copied
+ * into it and it is flagged OCFS2_REFCOUNT_LEAF_FL. The root is then
+ * rewritten as an extent list (OCFS2_REFCOUNT_TREE_FL) whose single record
+ * points at the new leaf.
+ *
+ * On success *ref_leaf_bh is the new leaf's buffer head; the reference
+ * taken by sb_getblk() is handed over with it. Returns 0 or a negative
+ * errno.
+ */
+static int ocfs2_expand_inline_ref_root(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head **ref_leaf_bh,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret;
+ u16 suballoc_bit_start;
+ u32 num_got;
+ u64 suballoc_loc, blkno;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_refcount_block *new_rb;
+ struct ocfs2_refcount_block *root_rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+ &suballoc_bit_start, &num_got,
+ &blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ new_bh = sb_getblk(sb, blkno);
+ if (new_bh == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+ ret = ocfs2_journal_access_rb(handle, ci, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Initialize ocfs2_refcount_block.
+ * It should contain the same information as the old root,
+ * so just memcpy it and change the corresponding fields.
+ */
+ memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
+
+ new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+ new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
+ new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ new_rb->rf_blkno = cpu_to_le64(blkno);
+ new_rb->rf_cpos = cpu_to_le32(0);
+ new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
+ new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
+ ocfs2_journal_dirty(handle, new_bh);
+
+ /* Now change the root: wipe everything from rf_list to the block end. */
+ memset(&root_rb->rf_list, 0, sb->s_blocksize -
+ offsetof(struct ocfs2_refcount_block, rf_list));
+ root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
+ root_rb->rf_clusters = cpu_to_le32(1);
+ root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
+ root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+ root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+ root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
+
+ ocfs2_journal_dirty(handle, ref_root_bh);
+
+ trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
+ le16_to_cpu(new_rb->rf_records.rl_used));
+
+ /* Hand the buffer to the caller; keep brelse() below from dropping it. */
+ *ref_leaf_bh = new_bh;
+ new_bh = NULL;
+out:
+ brelse(new_bh);
+ return ret;
+}
+
+/*
+ * Return 1 when, judged by low 32-bit cpos, prev ends at or before the
+ * start of next (the two do not intersect), 0 otherwise.
+ */
+static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
+					   struct ocfs2_refcount_rec *next)
+{
+	return ocfs2_get_ref_rec_low_cpos(prev) +
+		le32_to_cpu(prev->r_clusters) <=
+		ocfs2_get_ref_rec_low_cpos(next);
+}
+
+/* sort() comparator: order refcount records by their low 32-bit cpos. */
+static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
+{
+	const struct ocfs2_refcount_rec *lhs = a;
+	const struct ocfs2_refcount_rec *rhs = b;
+	u32 lhs_pos = ocfs2_get_ref_rec_low_cpos(lhs);
+	u32 rhs_pos = ocfs2_get_ref_rec_low_cpos(rhs);
+
+	/* Yields 1, -1 or 0 without risking overflow from a subtraction. */
+	return (lhs_pos > rhs_pos) - (lhs_pos < rhs_pos);
+}
+
+/* sort() comparator: order refcount records by their full 64-bit cpos. */
+static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
+{
+	const struct ocfs2_refcount_rec *lhs = a;
+	const struct ocfs2_refcount_rec *rhs = b;
+	u64 lhs_pos = le64_to_cpu(lhs->r_cpos);
+	u64 rhs_pos = le64_to_cpu(rhs->r_cpos);
+
+	/* Yields 1, -1 or 0 without truncating a 64-bit difference. */
+	return (lhs_pos > rhs_pos) - (lhs_pos < rhs_pos);
+}
+
+/* sort() swap callback: exchange two refcount records; size is unused. */
+static void swap_refcount_rec(void *a, void *b, int size)
+{
+	struct ocfs2_refcount_rec *first = a;
+	struct ocfs2_refcount_rec *second = b;
+	struct ocfs2_refcount_rec saved = *first;
+
+	*first = *second;
+	*second = saved;
+}
+
+/*
+ * The refcount records are ordered by their 64-bit cpos, but the b-tree
+ * uses only the low 32 bits as e_cpos.  A leaf may therefore only be split
+ * between two records whose low-32-bit ranges do not intersect.
+ *
+ * Note: the caller has already sorted the list by low 32-bit cpos, so
+ * start at the middle and probe outwards, one step to the left and then
+ * one to the right, stopping at the first usable boundary.
+ *
+ * On success *split_index is the index of the first record that moves to
+ * the new block and *split_pos is its low 32-bit cpos.  Returns -ENOSPC
+ * (leaving both outputs untouched) when no boundary exists.
+ */
+static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
+					 u32 *split_pos, int *split_index)
+{
+	int used = le16_to_cpu(rl->rl_used);
+	int mid = used / 2;
+	int chosen = -1;
+	int step;
+
+	for (step = 0; step < mid && chosen < 0; step++) {
+		int left = mid - step;
+		int right = mid + step + 1;
+
+		/* Boundary "step" records before the middle. */
+		if (ocfs2_refcount_rec_no_intersect(&rl->rl_recs[left - 1],
+						    &rl->rl_recs[left]))
+			chosen = left;
+		/* Boundary past the middle; never walk off the end. */
+		else if (right != used &&
+			 ocfs2_refcount_rec_no_intersect(
+						&rl->rl_recs[right - 1],
+						&rl->rl_recs[right]))
+			chosen = right;
+	}
+
+	if (chosen < 0)
+		return -ENOSPC;
+
+	*split_index = chosen;
+	*split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[chosen]);
+	return 0;
+}
+
+/*
+ * Move the upper part of the records in ref_leaf_bh into new_bh.
+ *
+ * Both buffers are modified in memory only; journal access and dirtying
+ * are left to the caller.  On success *split_cpos is the low 32-bit cpos
+ * of the first record moved, which is also stored in the new block's
+ * rf_cpos.  Returns 0, or -ENOSPC when no safe split position exists, in
+ * which case the old leaf is left in its original 64-bit cpos order.
+ */
+static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
+					    struct buffer_head *new_bh,
+					    u32 *split_cpos)
+{
+	int split_index = 0, num_moved, ret;
+	u32 cpos = 0;
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_list *rl = &rb->rf_records;
+	struct ocfs2_refcount_block *new_rb =
+			(struct ocfs2_refcount_block *)new_bh->b_data;
+	struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
+
+	trace_ocfs2_divide_leaf_refcount_block(
+		(unsigned long long)ref_leaf_bh->b_blocknr,
+		le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
+
+	/*
+	 * XXX: Improvement later.
+	 * If we know all the high 32 bit cpos is the same, no need to sort.
+	 *
+	 * In order to make the whole process safe, we do:
+	 * 1. sort the entries by their low 32 bit cpos first so that we can
+	 *    find the split cpos easily.
+	 * 2. move the refcount recs from the split point to the new block.
+	 * 3. sort the entries of both blocks by their 64 bit cpos again.
+	 */
+	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
+	     sizeof(struct ocfs2_refcount_rec),
+	     cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
+
+	ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
+	if (ret) {
+		mlog_errno(ret);
+		/*
+		 * Lookups in this leaf rely on 64-bit cpos order, so undo
+		 * the low-cpos sort before giving up.
+		 */
+		sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
+		     sizeof(struct ocfs2_refcount_rec),
+		     cmp_refcount_rec_by_cpos, swap_refcount_rec);
+		return ret;
+	}
+
+	new_rb->rf_cpos = cpu_to_le32(cpos);
+
+	/* move refcount records starting from split_index to the new block. */
+	num_moved = le16_to_cpu(rl->rl_used) - split_index;
+	memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
+	       num_moved * sizeof(struct ocfs2_refcount_rec));
+
+	/* ok, remove the entries we just moved over to the other block. */
+	memset(&rl->rl_recs[split_index], 0,
+	       num_moved * sizeof(struct ocfs2_refcount_rec));
+
+	/* change old and new rl_used accordingly. */
+	le16_add_cpu(&rl->rl_used, -num_moved);
+	new_rl->rl_used = cpu_to_le16(num_moved);
+
+	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
+	     sizeof(struct ocfs2_refcount_rec),
+	     cmp_refcount_rec_by_cpos, swap_refcount_rec);
+
+	sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
+	     sizeof(struct ocfs2_refcount_rec),
+	     cmp_refcount_rec_by_cpos, swap_refcount_rec);
+
+	*split_cpos = cpos;
+	return 0;
+}
+
+/*
+ * Add one more leaf refcount block to a refcount tree.
+ *
+ * A block is claimed from meta_ac and initialised as a leaf, part of the
+ * records of ref_leaf_bh is moved into it by
+ * ocfs2_divide_leaf_refcount_block(), and the new leaf is inserted into
+ * the root's extent tree at the split cpos. The root must already be in
+ * tree form (BUG_ON otherwise).
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_new_leaf_refcount_block(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret;
+ u16 suballoc_bit_start;
+ u32 num_got, new_cpos;
+ u64 suballoc_loc, blkno;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct ocfs2_refcount_block *root_rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_refcount_block *new_rb;
+ struct ocfs2_extent_tree ref_et;
+
+ BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+ &suballoc_bit_start, &num_got,
+ &blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ new_bh = sb_getblk(sb, blkno);
+ if (new_bh == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+ ret = ocfs2_journal_access_rb(handle, ci, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Initialize ocfs2_refcount_block. */
+ new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+ memset(new_rb, 0, sb->s_blocksize);
+ strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+ new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
+ new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+ new_rb->rf_blkno = cpu_to_le64(blkno);
+ new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
+ new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
+ new_rb->rf_records.rl_count =
+ cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
+ /* Copied as-is: both fields are already in on-disk byte order. */
+ new_rb->rf_generation = root_rb->rf_generation;
+
+ /* Move part of the old leaf's records over; new_cpos is the split point. */
+ ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
+ ocfs2_journal_dirty(handle, new_bh);
+
+ ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
+
+ trace_ocfs2_new_leaf_refcount_block(
+ (unsigned long long)new_bh->b_blocknr, new_cpos);
+
+ /* Insert the new leaf block with the specific offset cpos. */
+ ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
+ 1, 0, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ brelse(new_bh);
+ return ret;
+}
+
+/*
+ * Make room in a refcount tree whose leaf ref_leaf_bh is full.
+ *
+ * If the root still holds its records inline (root and leaf are the same
+ * buffer), it is first converted into a tree with one leaf.  The leaf is
+ * then split by adding a new leaf refcount block.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_expand_refcount_tree(handle_t *handle,
+				      struct ocfs2_caching_info *ci,
+				      struct buffer_head *ref_root_bh,
+				      struct buffer_head *ref_leaf_bh,
+				      struct ocfs2_alloc_context *meta_ac)
+{
+	struct buffer_head *leaf_to_split = NULL;
+	int status;
+
+	if (ref_root_bh != ref_leaf_bh) {
+		/* Already a tree: split the given leaf, holding our own ref. */
+		leaf_to_split = ref_leaf_bh;
+		get_bh(leaf_to_split);
+	} else {
+		/*
+		 * The old root bh hasn't been expanded to a b-tree,
+		 * so expand it first.
+		 */
+		status = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
+						      &leaf_to_split, meta_ac);
+		if (status) {
+			mlog_errno(status);
+			goto done;
+		}
+	}
+
+	/* Now add a new refcount block into the tree. */
+	status = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
+					       leaf_to_split, meta_ac);
+	if (status)
+		mlog_errno(status);
+done:
+	brelse(leaf_to_split);
+	return status;
+}
+
+/*
+ * Adjust the extent rec in b-tree representing ref_leaf_bh.
+ *
+ * Only called when we have inserted a new refcount rec at index 0
+ * which means ocfs2_extent_rec.e_cpos may need some change.
+ *
+ * Nothing is done when the root is not in tree form or when the leaf's
+ * rf_cpos is already at or below the low 32 bits of rec->r_cpos.
+ * Otherwise both the e_cpos of the extent rec and the rf_cpos of the leaf
+ * are lowered to the new value.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_adjust_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_refcount_rec *rec)
+{
+ int ret = 0, i;
+ u32 new_cpos, old_cpos;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ struct ocfs2_extent_list *el;
+
+ /* An inline root has no extent rec to adjust. */
+ if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
+ goto out;
+
+ /* From here on rb refers to the leaf block, not the root. */
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ old_cpos = le32_to_cpu(rb->rf_cpos);
+ new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
+ if (old_cpos <= new_cpos)
+ goto out;
+
+ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+
+ path = ocfs2_new_path_from_et(&et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(ci, path, old_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * 2 more credits, one for the leaf refcount block, one for
+ * the extent block that contains the extent rec.
+ */
+ ret = ocfs2_extend_trans(handle, 2);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* change the leaf extent block first. */
+ el = path_leaf_el(path);
+
+ /* Find the extent rec that currently starts at old_cpos. */
+ for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
+ if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
+ break;
+
+ BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
+
+ el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
+
+ /* change the rf_cpos in the leaf block. */
+ rb->rf_cpos = cpu_to_le32(new_cpos);
+
+ ocfs2_journal_dirty(handle, path_leaf_bh(path));
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
+
+out:
+ ocfs2_free_path(path);
+ return ret;
+}
+
+/*
+ * Insert rec at position index of the record list in ref_leaf_bh.
+ *
+ * When the list is full the tree is expanded first and the target leaf
+ * and index are looked up again. After the insertion the record is
+ * merged with contiguous neighbours when merge is non-zero, and when it
+ * went in at index 0 the extent rec covering the leaf is adjusted.
+ * ref_leaf_bh must be a block holding records, not a tree root (BUG_ON).
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_insert_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_refcount_rec *rec,
+ int index, int merge,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_list *rf_list = &rb->rf_records;
+ struct buffer_head *new_bh = NULL;
+
+ BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
+
+ /* Both fields are little-endian; an equality test needs no conversion. */
+ if (rf_list->rl_used == rf_list->rl_count) {
+ u64 cpos = le64_to_cpu(rec->r_cpos);
+ u32 len = le32_to_cpu(rec->r_clusters);
+
+ ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
+ ref_leaf_bh, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* The split may have moved the target: look up leaf and index again. */
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, len, NULL, &index,
+ &new_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ref_leaf_bh = new_bh;
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ rf_list = &rb->rf_records;
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Open a slot at index by shifting the tail up by one record. */
+ if (index < le16_to_cpu(rf_list->rl_used))
+ memmove(&rf_list->rl_recs[index + 1],
+ &rf_list->rl_recs[index],
+ (le16_to_cpu(rf_list->rl_used) - index) *
+ sizeof(struct ocfs2_refcount_rec));
+
+ trace_ocfs2_insert_refcount_rec(
+ (unsigned long long)ref_leaf_bh->b_blocknr, index,
+ (unsigned long long)le64_to_cpu(rec->r_cpos),
+ le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
+
+ rf_list->rl_recs[index] = *rec;
+
+ le16_add_cpu(&rf_list->rl_used, 1);
+
+ if (merge)
+ ocfs2_refcount_rec_merge(rb, index);
+
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
+
+ /* A new first record may lower the leaf's starting cpos in the b-tree. */
+ if (index == 0) {
+ ret = ocfs2_adjust_refcount_rec(handle, ci,
+ ref_root_bh,
+ ref_leaf_bh, rec);
+ if (ret)
+ mlog_errno(ret);
+ }
+out:
+ /* new_bh is only set when the leaf was looked up again above. */
+ brelse(new_bh);
+ return ret;
+}
+
+/*
+ * Split the refcount_rec indexed by "index" in ref_leaf_bh.
+ * This is much simpler than our b-tree code.
+ * split_rec is the new refcount rec we want to insert.
+ * If split_rec->r_refcount > 0, we are changing the refcount(in case we
+ * increase refcount or decrease a refcount to non-zero).
+ * If split_rec->r_refcount == 0, we are punching a hole in current refcount
+ * rec( in case we decrease a refcount to zero).
+ */
+static int ocfs2_split_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_refcount_rec *split_rec,
+ int index, int merge,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret, recs_need;
+ u32 len;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ struct ocfs2_refcount_list *rf_list = &rb->rf_records;
+ struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
+ struct ocfs2_refcount_rec *tail_rec = NULL;
+ struct buffer_head *new_bh = NULL;
+
+ BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
+
+ trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
+ le32_to_cpu(orig_rec->r_clusters),
+ le32_to_cpu(orig_rec->r_refcount),
+ le64_to_cpu(split_rec->r_cpos),
+ le32_to_cpu(split_rec->r_clusters),
+ le32_to_cpu(split_rec->r_refcount));
+
+ /*
+ * If we just need to split the header or tail clusters,
+ * no more recs are needed, just split is OK.
+ * Otherwise we at least need one new recs.
+ */
+ if (!split_rec->r_refcount &&
+ (split_rec->r_cpos == orig_rec->r_cpos ||
+ le64_to_cpu(split_rec->r_cpos) +
+ le32_to_cpu(split_rec->r_clusters) ==
+ le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
+ recs_need = 0;
+ else
+ recs_need = 1;
+
+ /*
+ * We need one more rec if we split in the middle and the new rec have
+ * some refcount in it.
+ */
+ if (split_rec->r_refcount &&
+ (split_rec->r_cpos != orig_rec->r_cpos &&
+ le64_to_cpu(split_rec->r_cpos) +
+ le32_to_cpu(split_rec->r_clusters) !=
+ le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
+ recs_need++;
+
+ /* If the leaf block don't have enough record, expand it. */
+ if (le16_to_cpu(rf_list->rl_used) + recs_need >
+ le16_to_cpu(rf_list->rl_count)) {
+ struct ocfs2_refcount_rec tmp_rec;
+ u64 cpos = le64_to_cpu(orig_rec->r_cpos);
+ len = le32_to_cpu(orig_rec->r_clusters);
+ ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
+ ref_leaf_bh, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We have to re-get it since now cpos may be moved to
+ * another leaf block.
+ */
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, len, &tmp_rec, &index,
+ &new_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ref_leaf_bh = new_bh;
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ rf_list = &rb->rf_records;
+ orig_rec = &rf_list->rl_recs[index];
+ }
+
+ ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We have calculated out how many new records we need and store
+ * in recs_need, so spare enough space first by moving the records
+ * after "index" to the end.
+ */
+ if (index != le16_to_cpu(rf_list->rl_used) - 1)
+ memmove(&rf_list->rl_recs[index + 1 + recs_need],
+ &rf_list->rl_recs[index + 1],
+ (le16_to_cpu(rf_list->rl_used) - index - 1) *
+ sizeof(struct ocfs2_refcount_rec));
+
+ len = (le64_to_cpu(orig_rec->r_cpos) +
+ le32_to_cpu(orig_rec->r_clusters)) -
+ (le64_to_cpu(split_rec->r_cpos) +
+ le32_to_cpu(split_rec->r_clusters));
+
+ /*
+ * If we have "len", the we will split in the tail and move it
+ * to the end of the space we have just spared.
+ */
+ if (len) {
+ tail_rec = &rf_list->rl_recs[index + recs_need];
+
+ memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
+ le64_add_cpu(&tail_rec->r_cpos,
+ le32_to_cpu(tail_rec->r_clusters) - len);
+ tail_rec->r_clusters = cpu_to_le32(len);
+ }
+
+ /*
+ * If the split pos isn't the same as the original one, we need to
+ * split in the head.
+ *
+ * Note: We have the chance that split_rec.r_refcount = 0,
+ * recs_need = 0 and len > 0, which means we just cut the head from
+ * the orig_rec and in that case we have done some modification in
+ * orig_rec above, so the check for r_cpos is faked.
+ */
+ if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
+ len = le64_to_cpu(split_rec->r_cpos) -
+ le64_to_cpu(orig_rec->r_cpos);
+ orig_rec->r_clusters = cpu_to_le32(len);
+ index++;
+ }
+
+ le16_add_cpu(&rf_list->rl_used, recs_need);
+
+ if (split_rec->r_refcount) {
+ rf_list->rl_recs[index] = *split_rec;
+ trace_ocfs2_split_refcount_rec_insert(
+ (unsigned long long)ref_leaf_bh->b_blocknr, index,
+ (unsigned long long)le64_to_cpu(split_rec->r_cpos),
+ le32_to_cpu(split_rec->r_clusters),
+ le32_to_cpu(split_rec->r_refcount));
+
+ if (merge)
+ ocfs2_refcount_rec_merge(rb, index);
+ }
+
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
+
+out:
+ brelse(new_bh);
+ return ret;
+}
+
+/*
+ * Increase by one the refcount of "len" clusters starting at "cpos" in the
+ * refcount tree rooted at ref_root_bh.
+ *
+ * The range is processed piecewise: every loop iteration looks up the
+ * record covering the current cpos, handles "set_len" clusters of it and
+ * then advances. "merge" is passed through to the helpers that change,
+ * insert or split a record.
+ *
+ * Returns 0 on success, or the error returned by the failing helper.
+ */
+static int __ocfs2_increase_refcount(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, u32 len, int merge,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret = 0, index;
+ struct buffer_head *ref_leaf_bh = NULL;
+ struct ocfs2_refcount_rec rec;
+ unsigned int set_len = 0;
+
+ trace_ocfs2_increase_refcount_begin(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)cpos, len);
+
+ while (len) {
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, len, &rec, &index,
+ &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ set_len = le32_to_cpu(rec.r_clusters);
+
+ /*
+ * Here we may meet with 3 situations:
+ *
+ * 1. If we find an already existing record that starts at cpos
+ * and is not longer than the remaining range, cool, we just
+ * need to increase the r_refcount and it is OK.
+ * 2. If we find a hole, just insert it with r_refcount = 1.
+ * 3. If we are in the middle of one extent record, split
+ * it.
+ */
+ if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
+ set_len <= len) {
+ trace_ocfs2_increase_refcount_change(
+ (unsigned long long)cpos, set_len,
+ le32_to_cpu(rec.r_refcount));
+ ret = ocfs2_change_refcount_rec(handle, ci,
+ ref_leaf_bh, index,
+ merge, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else if (!rec.r_refcount) {
+ rec.r_refcount = cpu_to_le32(1);
+
+ trace_ocfs2_increase_refcount_insert(
+ (unsigned long long)le64_to_cpu(rec.r_cpos),
+ set_len);
+ ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
+ ref_leaf_bh,
+ &rec, index,
+ merge, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else {
+ /*
+ * Clip the piece to the overlap of the requested range and
+ * the found record, then split it out with refcount + 1.
+ */
+ set_len = min((u64)(cpos + len),
+ le64_to_cpu(rec.r_cpos) + set_len) - cpos;
+ rec.r_cpos = cpu_to_le64(cpos);
+ rec.r_clusters = cpu_to_le32(set_len);
+ le32_add_cpu(&rec.r_refcount, 1);
+
+ trace_ocfs2_increase_refcount_split(
+ (unsigned long long)le64_to_cpu(rec.r_cpos),
+ set_len, le32_to_cpu(rec.r_refcount));
+ ret = ocfs2_split_refcount_rec(handle, ci,
+ ref_root_bh, ref_leaf_bh,
+ &rec, index, merge,
+ meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Advance past the piece just handled and drop the leaf buffer. */
+ cpos += set_len;
+ len -= set_len;
+ brelse(ref_leaf_bh);
+ ref_leaf_bh = NULL;
+ }
+
+out:
+ brelse(ref_leaf_bh);
+ return ret;
+}
+
+/*
+ * Remove the empty leaf refcount block ref_leaf_bh from the refcount tree
+ * rooted at ref_root_bh and queue the block in "dealloc".
+ *
+ * The root's cluster count is decremented. If the root's extent list has
+ * no record left afterwards, the root is reset to a plain refcount block
+ * with an empty record list.
+ *
+ * Returns 0 on success, or the error returned by the failing helper.
+ */
+static int ocfs2_remove_refcount_extent(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ struct ocfs2_extent_tree et;
+ struct ocfs2_refcount_block *leaf_rb, *root_rb;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ int status;
+
+ leaf_rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+
+ /* Only a leaf that holds no record may be removed. */
+ BUG_ON(leaf_rb->rf_records.rl_used);
+
+ trace_ocfs2_remove_refcount_extent(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)ref_leaf_bh->b_blocknr,
+ le32_to_cpu(leaf_rb->rf_cpos));
+
+ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+ status = ocfs2_remove_extent(handle, &et,
+ le32_to_cpu(leaf_rb->rf_cpos), 1,
+ meta_ac, dealloc);
+ if (status) {
+ mlog_errno(status);
+ return status;
+ }
+
+ ocfs2_remove_from_cache(ci, ref_leaf_bh);
+
+ /*
+ * Hand the leaf block over to the dealloc context so that it is
+ * freed when dealloc is run.
+ */
+ status = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
+ le16_to_cpu(leaf_rb->rf_suballoc_slot),
+ le64_to_cpu(leaf_rb->rf_suballoc_loc),
+ le64_to_cpu(leaf_rb->rf_blkno),
+ le16_to_cpu(leaf_rb->rf_suballoc_bit));
+ if (status) {
+ mlog_errno(status);
+ return status;
+ }
+
+ status = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status) {
+ mlog_errno(status);
+ return status;
+ }
+
+ root_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ le32_add_cpu(&root_rb->rf_clusters, -1);
+
+ /*
+ * With no leaf extent left at all, turn the root back into a
+ * refcount block that stores records itself.
+ */
+ if (!root_rb->rf_list.l_next_free_rec) {
+ BUG_ON(root_rb->rf_clusters);
+
+ trace_ocfs2_restore_refcount_block(
+ (unsigned long long)ref_root_bh->b_blocknr);
+
+ root_rb->rf_flags = 0;
+ root_rb->rf_parent = 0;
+ root_rb->rf_cpos = 0;
+ memset(&root_rb->rf_records, 0, sb->s_blocksize -
+ offsetof(struct ocfs2_refcount_block, rf_records));
+ root_rb->rf_records.rl_count =
+ cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
+ }
+
+ ocfs2_journal_dirty(handle, ref_root_bh);
+
+ return status;
+}
+
+/*
+ * Increase the refcount of "len" clusters starting at "cpos".
+ * Thin wrapper around __ocfs2_increase_refcount() that always asks for
+ * record merging.
+ */
+int ocfs2_increase_refcount(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ const int merge = 1;
+
+ return __ocfs2_increase_refcount(handle, ci, ref_root_bh, cpos, len,
+ merge, meta_ac, dealloc);
+}
+
+/*
+ * Decrease by one the refcount of the range [cpos, cpos + len), which must
+ * lie completely inside the record at "index" of ref_leaf_bh.
+ *
+ * A range that matches the record exactly is changed in place; any other
+ * range is split out of the record first. If the leaf's used-record count
+ * is zero afterwards and the leaf is not the root itself, the leaf is
+ * removed from the tree.
+ *
+ * Returns 0 on success, or the error returned by the failing helper.
+ */
+static int ocfs2_decrease_refcount_rec(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ struct buffer_head *ref_leaf_bh,
+ int index, u64 cpos, unsigned int len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ struct ocfs2_refcount_block *rb;
+ struct ocfs2_refcount_rec *rec;
+ u64 rec_start;
+ u32 rec_len;
+ int status;
+
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+ rec = &rb->rf_records.rl_recs[index];
+ rec_start = le64_to_cpu(rec->r_cpos);
+ rec_len = le32_to_cpu(rec->r_clusters);
+
+ BUG_ON(cpos < rec_start);
+ BUG_ON(cpos + len > rec_start + rec_len);
+
+ trace_ocfs2_decrease_refcount_rec(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)cpos, len);
+
+ if (cpos != rec_start || len != rec_len) {
+ /* Partial cover: carve the range out with refcount - 1. */
+ struct ocfs2_refcount_rec split = *rec;
+
+ split.r_cpos = cpu_to_le64(cpos);
+ split.r_clusters = cpu_to_le32(len);
+ le32_add_cpu(&split.r_refcount, -1);
+
+ status = ocfs2_split_refcount_rec(handle, ci,
+ ref_root_bh, ref_leaf_bh,
+ &split, index, 1,
+ meta_ac, dealloc);
+ } else {
+ status = ocfs2_change_refcount_rec(handle, ci,
+ ref_leaf_bh, index, 1, -1);
+ }
+
+ if (status) {
+ mlog_errno(status);
+ return status;
+ }
+
+ /* Nothing more to do unless a non-root leaf became empty. */
+ if (rb->rf_records.rl_used || ref_leaf_bh == ref_root_bh)
+ return status;
+
+ status = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
+ ref_leaf_bh, meta_ac,
+ dealloc);
+ if (status)
+ mlog_errno(status);
+
+ return status;
+}
+
+/*
+ * Decrease by one the refcount of "len" clusters starting at "cpos" in the
+ * refcount tree rooted at ref_root_bh, one refcount record at a time.
+ *
+ * Every cluster in the range must have a non-zero refcount; without
+ * "delete" the refcount must additionally not exceed 1. With "delete" set,
+ * pieces whose refcount was 1 are also queued in "dealloc" through
+ * ocfs2_cache_cluster_dealloc().
+ *
+ * Returns 0 on success, or the error returned by the failing helper.
+ */
+static int __ocfs2_decrease_refcount(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int delete)
+{
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct buffer_head *ref_leaf_bh = NULL;
+ struct ocfs2_refcount_rec rec;
+ unsigned int refcount, chunk;
+ int ret = 0, index = 0;
+
+ trace_ocfs2_decrease_refcount(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)cpos, len, delete);
+
+ while (len) {
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, len, &rec, &index,
+ &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ /* rec is a private copy, so this value stays valid below. */
+ refcount = le32_to_cpu(rec.r_refcount);
+ BUG_ON(refcount == 0);
+ BUG_ON(!delete && refcount > 1);
+
+ /* Part of the request that falls into the record just found. */
+ chunk = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
+ le32_to_cpu(rec.r_clusters)) - cpos;
+
+ ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
+ ref_leaf_bh, index,
+ cpos, chunk,
+ meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ if (refcount == 1 && delete) {
+ ret = ocfs2_cache_cluster_dealloc(dealloc,
+ ocfs2_clusters_to_blocks(sb, cpos),
+ chunk);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ cpos += chunk;
+ len -= chunk;
+ brelse(ref_leaf_bh);
+ ref_leaf_bh = NULL;
+ }
+
+ brelse(ref_leaf_bh);
+ return ret;
+}
+
+/*
+ * Decrease the refcount of "len" clusters starting at "cpos" in the
+ * refcount tree that "inode" refers to. The inode must have
+ * OCFS2_HAS_REFCOUNT_FL set. See __ocfs2_decrease_refcount() for the
+ * meaning of "delete".
+ *
+ * Caller must hold refcount tree lock.
+ */
+int ocfs2_decrease_refcount(struct inode *inode,
+ handle_t *handle, u32 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int delete)
+{
+ struct ocfs2_refcount_tree *tree;
+ struct buffer_head *ref_root_bh = NULL;
+ u64 ref_blkno;
+ int ret;
+
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ /* Each step runs only if every step before it succeeded. */
+ ret = ocfs2_get_refcount_block(inode, &ref_blkno);
+ if (!ret) {
+ ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
+ ref_blkno, &tree);
+ }
+ if (!ret) {
+ ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
+ &ref_root_bh);
+ }
+ if (!ret) {
+ ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci,
+ ref_root_bh, cpos, len,
+ meta_ac, dealloc, delete);
+ }
+ if (ret)
+ mlog_errno(ret);
+
+ brelse(ref_root_bh);
+ return ret;
+}
+
+/*
+ * Mark the already-existing extent at cpos as refcounted for len clusters
+ * by adding the OCFS2_EXT_REFCOUNTED extent flag.
+ *
+ * If the existing extent is larger than the request, a split is
+ * initiated. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ *
+ * Returns -EROFS if the refcount tree feature bit is not set in the super
+ * block, otherwise the result of ocfs2_change_extent_flag().
+ */
+static int ocfs2_mark_extent_refcounted(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ handle_t *handle, u32 cpos,
+ u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ struct super_block *sb = inode->i_sb;
+ int status;
+
+ trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
+ cpos, len, phys);
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(sb))) {
+ ocfs2_error(sb, "Inode %lu want to use refcount "
+ "tree, but the feature bit is not set in the "
+ "super block.", inode->i_ino);
+ return -EROFS;
+ }
+
+ status = ocfs2_change_extent_flag(handle, et, cpos,
+ len, phys, meta_ac, dealloc,
+ OCFS2_EXT_REFCOUNTED, 0);
+ if (status)
+ mlog_errno(status);
+
+ return status;
+}
+
+/*
+ * Given some contiguous physical clusters, calculate what we need
+ * for modifying their refcount.
+ *
+ * The range [start_cpos, start_cpos + clusters) is walked record by
+ * record. For each leaf block touched, one credit is added and the number
+ * of records that may have to be added to that leaf is estimated; a leaf
+ * that cannot hold them counts as one more block to allocate.
+ *
+ * Both *meta_add and *credits are only ever incremented, so the caller
+ * must initialise them. Returns 0 on success or the error returned by
+ * ocfs2_get_refcount_rec().
+ */
+static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 start_cpos,
+ u32 clusters,
+ int *meta_add,
+ int *credits)
+{
+ int ret = 0, index, ref_blocks = 0, recs_add = 0;
+ u64 cpos = start_cpos;
+ struct ocfs2_refcount_block *rb;
+ struct ocfs2_refcount_rec rec;
+ struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
+ u32 len;
+
+ while (clusters) {
+ ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+ cpos, clusters, &rec,
+ &index, &ref_leaf_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (ref_leaf_bh != prev_bh) {
+ /*
+ * Now we encounter a new leaf block, so calculate
+ * whether we need to extend the old leaf.
+ */
+ if (prev_bh) {
+ rb = (struct ocfs2_refcount_block *)
+ prev_bh->b_data;
+
+ if (le16_to_cpu(rb->rf_records.rl_used) +
+ recs_add >
+ le16_to_cpu(rb->rf_records.rl_count))
+ ref_blocks++;
+ }
+
+ /* Start counting afresh for the new leaf and keep a reference to it. */
+ recs_add = 0;
+ *credits += 1;
+ brelse(prev_bh);
+ prev_bh = ref_leaf_bh;
+ get_bh(prev_bh);
+ }
+
+ /*
+ * NOTE(review): rb is not read again before it is reassigned, so
+ * this assignment looks like a dead store.
+ */
+ rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+
+ trace_ocfs2_calc_refcount_meta_credits_iterate(
+ recs_add, (unsigned long long)cpos, clusters,
+ (unsigned long long)le64_to_cpu(rec.r_cpos),
+ le32_to_cpu(rec.r_clusters),
+ le32_to_cpu(rec.r_refcount), index);
+
+ /* Part of the remaining range that is covered by this record. */
+ len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
+ le32_to_cpu(rec.r_clusters)) - cpos;
+ /*
+ * We record all the records which will be inserted to the
+ * same refcount block, so that we can tell exactly whether
+ * we need a new refcount block or not.
+ *
+ * If we will insert a new one, this is easy and only happens
+ * during adding refcounted flag to the extent, so we don't
+ * have a chance of splitting. We just need one record.
+ *
+ * If the refcount rec already exists, that would be a little
+ * complicated. we may have to:
+ * 1) split at the beginning if the start pos isn't aligned.
+ * we need 1 more record in this case.
+ * 2) split in the end if the end pos isn't aligned.
+ * we need 1 more record in this case.
+ * 3) split in the middle because of file system fragmentation.
+ * we need 2 more records in this case(we can't detect this
+ * beforehand, so always think of the worst case).
+ */
+ if (rec.r_refcount) {
+ recs_add += 2;
+ /* Check whether we need a split at the beginning. */
+ if (cpos == start_cpos &&
+ cpos != le64_to_cpu(rec.r_cpos))
+ recs_add++;
+
+ /* Check whether we need a split in the end. */
+ if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
+ le32_to_cpu(rec.r_clusters))
+ recs_add++;
+ } else
+ recs_add++;
+
+ brelse(ref_leaf_bh);
+ ref_leaf_bh = NULL;
+ clusters -= len;
+ cpos += len;
+ }
+
+ /* Account for the last leaf block that was visited. */
+ if (prev_bh) {
+ rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
+
+ if (le16_to_cpu(rb->rf_records.rl_used) + recs_add >
+ le16_to_cpu(rb->rf_records.rl_count))
+ ref_blocks++;
+
+ *credits += 1;
+ }
+
+ if (!ref_blocks)
+ goto out;
+
+ *meta_add += ref_blocks;
+ *credits += ref_blocks;
+
+ /*
+ * So we may need ref_blocks to insert into the tree.
+ * That also means we need to change the b-tree and add that number
+ * of records since we never merge them.
+ * We need one more block for expansion since the new created leaf
+ * block is also full and needs split.
+ */
+ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+ if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+ *meta_add += ocfs2_extend_meta_needed(et.et_root_el);
+ *credits += ocfs2_calc_extend_credits(sb,
+ et.et_root_el);
+ } else {
+ *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+ *meta_add += 1;
+ }
+
+out:
+
+ /*
+ * NOTE(review): "clusters" has been decremented by the loop above, so
+ * on the success path the value traced here is 0, not the caller's.
+ */
+ trace_ocfs2_calc_refcount_meta_credits(
+ (unsigned long long)start_cpos, clusters,
+ *meta_add, *credits);
+ brelse(ref_leaf_bh);
+ brelse(prev_bh);
+ return ret;
+}
+
+/*
+ * For the refcount tree, we will decrease the refcount of some contiguous
+ * clusters, so just go through it to see how many blocks we are going to
+ * touch and whether we need to create new blocks.
+ *
+ * Normally the refcount blocks storing these refcounts should be
+ * contiguous as well, so that we can get the number easily.
+ * We will at most split off 2 refcount records and add 2 more
+ * refcount blocks, so just check it in a rough way.
+ *
+ * The results are accumulated into *credits and *ref_blocks by
+ * ocfs2_calc_refcount_meta_credits(). Returns -EROFS if the refcount tree
+ * feature bit is not set in the super block.
+ *
+ * Caller must hold refcount tree lock.
+ */
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+ u64 refcount_loc,
+ u64 phys_blkno,
+ u32 clusters,
+ int *credits,
+ int *ref_blocks)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_refcount_tree *tree;
+ struct buffer_head *ref_root_bh = NULL;
+ u64 start_cpos = ocfs2_blocks_to_clusters(sb, phys_blkno);
+ int ret;
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(sb))) {
+ ocfs2_error(sb, "Inode %lu want to use refcount "
+ "tree, but the feature bit is not set in the "
+ "super block.", inode->i_ino);
+ ret = -EROFS;
+ goto out;
+ }
+
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ /* Each step runs only if every step before it succeeded. */
+ ret = ocfs2_get_refcount_tree(OCFS2_SB(sb), refcount_loc, &tree);
+ if (!ret) {
+ ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
+ &ref_root_bh);
+ }
+ if (!ret) {
+ ret = ocfs2_calc_refcount_meta_credits(sb,
+ &tree->rf_ci,
+ ref_root_bh,
+ start_cpos, clusters,
+ ref_blocks, credits);
+ }
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
+
+out:
+ brelse(ref_root_bh);
+ return ret;
+}
+
+#define MAX_CONTIG_BYTES 1048576
+
+/*
+ * Cluster count for MAX_CONTIG_BYTES bytes on this volume, as computed by
+ * ocfs2_clusters_for_bytes().
+ */
+static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
+{
+ unsigned int nr_clusters = ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
+
+ return nr_clusters;
+}
+
+/*
+ * Bit mask that clears every bit below ocfs2_cow_contig_clusters(), i.e.
+ * the complement of (contig_clusters - 1).
+ */
+static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
+{
+ unsigned int low_bits = ocfs2_cow_contig_clusters(sb) - 1;
+
+ return ~low_bits;
+}
+
+/*
+ * Given an extent that starts at 'start' and an I/O that starts at 'cpos',
+ * find an offset (start + (n * contig_clusters)) that is closest to cpos
+ * while still being less than or equal to it.
+ *
+ * The goal is to break the extent at a multiple of contig_clusters.
+ * 'start' must not be greater than 'cpos'.
+ */
+static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
+ unsigned int start,
+ unsigned int cpos)
+{
+ unsigned int aligned_off;
+
+ BUG_ON(start > cpos);
+
+ /* Distance from start to cpos with the low bits masked off. */
+ aligned_off = (cpos - start) & ocfs2_cow_contig_mask(sb);
+
+ return start + aligned_off;
+}
+
+/*
+ * Given a cluster count of len, pad it out so that it is a multiple
+ * of contig_clusters. If the padding wraps around, UINT_MAX is returned
+ * instead.
+ */
+static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
+ unsigned int len)
+{
+ unsigned int round_up = ocfs2_cow_contig_clusters(sb) - 1;
+ unsigned int padded = (len + round_up) & ocfs2_cow_contig_mask(sb);
+
+ /* Did we wrap? */
+ return padded < len ? UINT_MAX : padded;
+}
+
+/*
+ * Calculate the start and number of virtual clusters we need to CoW.
+ *
+ * cpos is the virtual start cluster position we want to do CoW in a
+ * file and write_len is the cluster length.
+ * max_cpos is the place where we want to stop CoW intentionally;
+ * cpos + write_len must not exceed it.
+ *
+ * Normally we will start CoW from the beginning of the extent record
+ * containing cpos. We try to break up extents on boundaries of
+ * MAX_CONTIG_BYTES so that we get good I/O from the resulting extent tree.
+ *
+ * el is the root extent list of the inode. For a tree with depth > 0 the
+ * leaf containing cpos is looked up, and following leaves are read through
+ * h_next_leaf_blk while the write is not covered yet.
+ *
+ * The result is stored in *cow_start and *cow_len. Returns 0 on success,
+ * -EROFS if the leaf found has a non-zero tree depth, or the error
+ * returned by a failing read.
+ */
+static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
+ struct ocfs2_extent_list *el,
+ u32 cpos,
+ u32 write_len,
+ u32 max_cpos,
+ u32 *cow_start,
+ u32 *cow_len)
+{
+ int ret = 0;
+ int tree_height = le16_to_cpu(el->l_tree_depth), i;
+ struct buffer_head *eb_bh = NULL;
+ struct ocfs2_extent_block *eb = NULL;
+ struct ocfs2_extent_rec *rec;
+ unsigned int want_clusters, rec_end = 0;
+ int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
+ int leaf_clusters;
+
+ BUG_ON(cpos + write_len > max_cpos);
+
+ if (tree_height > 0) {
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in "
+ "leaf block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
+ }
+ }
+
+ *cow_len = 0;
+ for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
+
+ if (ocfs2_is_empty_extent(rec)) {
+ mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
+ "index %d\n", inode->i_ino, i);
+ continue;
+ }
+
+ /* Skip the records that end at or before cpos. */
+ if (le32_to_cpu(rec->e_cpos) +
+ le16_to_cpu(rec->e_leaf_clusters) <= cpos)
+ continue;
+
+ if (*cow_len == 0) {
+ /*
+ * We should find a refcounted record in the
+ * first pass.
+ */
+ BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
+ *cow_start = le32_to_cpu(rec->e_cpos);
+ }
+
+ /*
+ * If we encounter a hole, a non-refcounted record or
+ * pass the max_cpos, stop the search.
+ */
+ if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
+ (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
+ (max_cpos <= le32_to_cpu(rec->e_cpos)))
+ break;
+
+ /* Clip the record at max_cpos. */
+ leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
+ rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
+ if (rec_end > max_cpos) {
+ rec_end = max_cpos;
+ leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
+ }
+
+ /*
+ * How many clusters do we actually need from
+ * this extent? First we see how many we actually
+ * need to complete the write. If that's smaller
+ * than contig_clusters, we try for contig_clusters.
+ */
+ if (!*cow_len)
+ want_clusters = write_len;
+ else
+ want_clusters = (cpos + write_len) -
+ (*cow_start + *cow_len);
+ if (want_clusters < contig_clusters)
+ want_clusters = contig_clusters;
+
+ /*
+ * If the write does not cover the whole extent, we
+ * need to calculate how we're going to split the extent.
+ * We try to do it on contig_clusters boundaries.
+ *
+ * Any extent smaller than contig_clusters will be
+ * CoWed in its entirety.
+ */
+ if (leaf_clusters <= contig_clusters)
+ *cow_len += leaf_clusters;
+ else if (*cow_len || (*cow_start == cpos)) {
+ /*
+ * This extent needs to be CoW'd from its
+ * beginning, so all we have to do is compute
+ * how many clusters to grab. We align
+ * want_clusters to the edge of contig_clusters
+ * to get better I/O.
+ */
+ want_clusters = ocfs2_cow_align_length(inode->i_sb,
+ want_clusters);
+
+ if (leaf_clusters < want_clusters)
+ *cow_len += leaf_clusters;
+ else
+ *cow_len += want_clusters;
+ } else if ((*cow_start + contig_clusters) >=
+ (cpos + write_len)) {
+ /*
+ * Breaking off contig_clusters at the front
+ * of the extent will cover our write. That's
+ * easy.
+ */
+ *cow_len = contig_clusters;
+ } else if ((rec_end - cpos) <= contig_clusters) {
+ /*
+ * Breaking off contig_clusters at the tail of
+ * this extent will cover cpos.
+ */
+ *cow_start = rec_end - contig_clusters;
+ *cow_len = contig_clusters;
+ } else if ((rec_end - cpos) <= want_clusters) {
+ /*
+ * While we can't fit the entire write in this
+ * extent, we know that the write goes from cpos
+ * to the end of the extent. Break that off.
+ * We try to break it at some multiple of
+ * contig_clusters from the front of the extent.
+ * Failing that (ie, cpos is within
+ * contig_clusters of the front), we'll CoW the
+ * entire extent.
+ */
+ *cow_start = ocfs2_cow_align_start(inode->i_sb,
+ *cow_start, cpos);
+ *cow_len = rec_end - *cow_start;
+ } else {
+ /*
+ * Ok, the entire write lives in the middle of
+ * this extent. Let's try to slice the extent up
+ * nicely. Optimally, our CoW region starts at
+ * m*contig_clusters from the beginning of the
+ * extent and goes for n*contig_clusters,
+ * covering the entire write.
+ */
+ *cow_start = ocfs2_cow_align_start(inode->i_sb,
+ *cow_start, cpos);
+
+ want_clusters = (cpos + write_len) - *cow_start;
+ want_clusters = ocfs2_cow_align_length(inode->i_sb,
+ want_clusters);
+ if (*cow_start + want_clusters <= rec_end)
+ *cow_len = want_clusters;
+ else
+ *cow_len = rec_end - *cow_start;
+ }
+
+ /* Have we covered our entire write yet? */
+ if ((*cow_start + *cow_len) >= (cpos + write_len))
+ break;
+
+ /*
+ * If we reach the end of the extent block and don't get enough
+ * clusters, continue with the next extent block if possible.
+ */
+ if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
+ eb && eb->h_next_leaf_blk) {
+ /*
+ * eb points into eb_bh->b_data, so fetch the next leaf's
+ * block number before the reference to eb_bh is dropped.
+ */
+ u64 next_leaf_blk = le64_to_cpu(eb->h_next_leaf_blk);
+
+ brelse(eb_bh);
+ eb_bh = NULL;
+
+ ret = ocfs2_read_extent_block(INODE_CACHE(inode),
+ next_leaf_blk,
+ &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+ /* The loop increment restarts the scan at record 0. */
+ i = -1;
+ }
+ }
+
+out:
+ brelse(eb_bh);
+ return ret;
+}
+
+/*
+ * Prepare meta_ac, data_ac and calculate credits when we want to add some
+ * num_clusters in data_tree "et" and change the refcount for the old
+ * clusters (starting from p_cluster) in the refcount tree.
+ *
+ * Note:
+ * 1. Since we may split the old tree, we will need at most
+ * num_clusters + 2 more new leaf records.
+ * 2. In some cases, we may not need to reserve new clusters (e.g. reflink),
+ * so just give data_ac = NULL.
+ *
+ * *credits is only incremented, so the caller must initialise it.
+ * Returns 0 on success or an error code; on error a non-NULL *meta_ac is
+ * freed and reset to NULL.
+ */
+static int ocfs2_lock_refcount_allocators(struct super_block *sb,
+ u32 p_cluster, u32 num_clusters,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_alloc_context **meta_ac,
+ struct ocfs2_alloc_context **data_ac,
+ int *credits)
+{
+ int ret = 0, meta_add = 0;
+ int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
+
+ /* A negative count is an error code from ocfs2_num_free_extents(). */
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Not enough free records in the data tree: plan to extend it. */
+ if (num_free_extents < num_clusters + 2)
+ meta_add =
+ ocfs2_extend_meta_needed(et->et_root_el);
+
+ *credits += ocfs2_calc_extend_credits(sb, et->et_root_el);
+
+ /* Add what the refcount tree itself needs. */
+ ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
+ p_cluster, num_clusters,
+ &meta_add, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
+ ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
+ meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Data clusters are reserved only when the caller asked for them. */
+ if (data_ac) {
+ ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
+ data_ac);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out:
+ if (ret) {
+ if (*meta_ac) {
+ ocfs2_free_alloc_context(*meta_ac);
+ *meta_ac = NULL;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Per-buffer callback that ocfs2_duplicate_clusters_by_page() hands to
+ * walk_page_buffers(): clears the mapped state of "bh".
+ * The buffer must not be dirty. "handle" is not used. Always returns 0.
+ */
+static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
+{
+ BUG_ON(buffer_dirty(bh));
+
+ clear_buffer_mapped(bh);
+
+ return 0;
+}
+
+/*
+ * Duplicate "new_len" clusters of file data, starting at virtual cluster
+ * "cpos", onto the clusters starting at "new_cluster" by going through the
+ * inode's page cache.
+ *
+ * Each page in the range is brought uptodate if necessary, has the mapped
+ * state of its buffers cleared, and is then passed to
+ * ocfs2_map_and_dirty_page() together with the new block number.
+ * Pages beyond i_size are not touched. "old_cluster" is only traced.
+ *
+ * Returns 0 on success, -ENOMEM if a page cannot be obtained, or the error
+ * returned by a failing helper.
+ */
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len)
+{
+ int ret = 0, partial;
+ struct super_block *sb = inode->i_sb;
+ u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
+ struct page *page;
+ pgoff_t page_index;
+ unsigned int from, to;
+ loff_t offset, end, map_end;
+ struct address_space *mapping = inode->i_mapping;
+
+ trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
+ new_cluster, new_len);
+
+ offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+ /*
+ * Widen new_len before shifting: a 32-bit shift would wrap for
+ * ranges of 4GB and more and yield a too small 'end'.
+ */
+ end = offset + ((loff_t)new_len << OCFS2_SB(sb)->s_clustersize_bits);
+ /*
+ * We only duplicate pages until we reach the page that contains
+ * i_size - 1. So trim 'end' to i_size.
+ */
+ if (end > i_size_read(inode))
+ end = i_size_read(inode);
+
+ while (offset < end) {
+ page_index = offset >> PAGE_CACHE_SHIFT;
+ map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+ if (map_end > end)
+ map_end = end;
+
+ /* from, to is the offset within the page. */
+ from = offset & (PAGE_CACHE_SIZE - 1);
+ to = PAGE_CACHE_SIZE;
+ if (map_end & (PAGE_CACHE_SIZE - 1))
+ to = map_end & (PAGE_CACHE_SIZE - 1);
+
+ page = find_or_create_page(mapping, page_index, GFP_NOFS);
+ if (!page) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ break;
+ }
+
+ /*
+ * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
+ * can't be dirtied before we CoW it out.
+ */
+ if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
+ BUG_ON(PageDirty(page));
+
+ if (!PageUptodate(page)) {
+ ret = block_read_full_page(page, ocfs2_get_block);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ lock_page(page);
+ }
+
+ if (page_has_buffers(page)) {
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, &partial,
+ ocfs2_clear_cow_buffer);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ }
+
+ ocfs2_map_and_dirty_page(inode,
+ handle, from, to,
+ page, 0, &new_block);
+ mark_page_accessed(page);
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ page = NULL;
+ offset = map_end;
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Duplicate "new_len" clusters block by block: every block of the old
+ * clusters (starting at "old_cluster") is read, copied into the matching
+ * block of the new clusters (starting at "new_cluster") and journalled
+ * through "handle". "cpos" is only traced.
+ *
+ * Returns 0 on success, -ENOMEM if a buffer for a new block cannot be
+ * obtained, or the error returned by a failing helper.
+ */
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ struct ocfs2_caching_info *ci = INODE_CACHE(inode);
+ struct buffer_head *src_bh = NULL;
+ struct buffer_head *dst_bh = NULL;
+ u64 src_blkno = ocfs2_clusters_to_blocks(sb, old_cluster);
+ u64 dst_blkno = ocfs2_clusters_to_blocks(sb, new_cluster);
+ int remaining = ocfs2_clusters_to_blocks(sb, new_len);
+ int ret = 0;
+
+ trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
+ new_cluster, new_len);
+
+ while (remaining > 0) {
+ dst_bh = sb_getblk(osb->sb, dst_blkno);
+ if (dst_bh == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ break;
+ }
+
+ ocfs2_set_new_buffer_uptodate(ci, dst_bh);
+
+ ret = ocfs2_read_block(ci, src_blkno, &src_bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_journal_access(handle, ci, dst_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ memcpy(dst_bh->b_data, src_bh->b_data, sb->s_blocksize);
+ ocfs2_journal_dirty(handle, dst_bh);
+
+ brelse(dst_bh);
+ brelse(src_bh);
+ dst_bh = NULL;
+ src_bh = NULL;
+
+ /* Move on to the next pair of blocks. */
+ src_blkno++;
+ dst_blkno++;
+ remaining--;
+ }
+
+ /* Non-NULL only when the loop was left through an error. */
+ brelse(dst_bh);
+ brelse(src_bh);
+ return ret;
+}
+
+/*
+ * Replace the extent record that covers @cpos in @et with one describing
+ * @len clusters at @p_cluster whose flags are @ext_flags minus
+ * OCFS2_EXT_REFCOUNTED.
+ *
+ * The leaf holding @cpos is located with ocfs2_find_path() and the
+ * replacement is carried out by ocfs2_split_extent().
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, -EROFS (after
+ * ocfs2_error()) if the extent is missing from the leaf, or the error of
+ * ocfs2_find_path() / ocfs2_split_extent().
+ */
+static int ocfs2_clear_ext_refcount(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
+				    u32 cpos, u32 p_cluster, u32 len,
+				    unsigned int ext_flags,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+	u64 owner = ocfs2_metadata_cache_owner(et->et_ci);
+	struct ocfs2_extent_rec new_rec;
+	struct ocfs2_extent_list *leaf_el;
+	struct ocfs2_path *leaf_path = NULL;
+	int status, slot;
+
+	trace_ocfs2_clear_ext_refcount((unsigned long long)owner,
+				       cpos, len, p_cluster, ext_flags);
+
+	/* Describe the record as it should look once the flag is gone. */
+	memset(&new_rec, 0, sizeof(new_rec));
+	new_rec.e_cpos = cpu_to_le32(cpos);
+	new_rec.e_leaf_clusters = cpu_to_le16(len);
+	new_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
+							       p_cluster));
+	new_rec.e_flags = ext_flags;
+	new_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
+
+	leaf_path = ocfs2_new_path_from_et(et);
+	if (!leaf_path) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto release_path;
+	}
+
+	status = ocfs2_find_path(et->et_ci, leaf_path, cpos);
+	if (status) {
+		mlog_errno(status);
+		goto release_path;
+	}
+
+	leaf_el = path_leaf_el(leaf_path);
+
+	slot = ocfs2_search_extent_list(leaf_el, cpos);
+	if (slot == -1 || slot >= le16_to_cpu(leaf_el->l_next_free_rec)) {
+		ocfs2_error(sb,
+			    "Inode %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			    (unsigned long long)owner, cpos);
+		status = -EROFS;
+		goto release_path;
+	}
+
+	status = ocfs2_split_extent(handle, et, leaf_path, slot,
+				    &new_rec, meta_ac, dealloc);
+	if (status)
+		mlog_errno(status);
+
+release_path:
+	ocfs2_free_path(leaf_path);
+	return status;
+}
+
+/*
+ * Point the extent at @cpos to the freshly allocated clusters @new.
+ *
+ * Unless the extent is flagged OCFS2_EXT_UNWRITTEN, the contents of @old
+ * are first copied to @new through context->cow_duplicate_clusters(); the
+ * extent record is then rewritten without the refcounted flag by
+ * ocfs2_clear_ext_refcount().
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int ocfs2_replace_clusters(handle_t *handle,
+				  struct ocfs2_cow_context *context,
+				  u32 cpos, u32 old,
+				  u32 new, u32 len,
+				  unsigned int ext_flags)
+{
+	struct ocfs2_caching_info *cache = context->data_et.et_ci;
+	u64 owner = ocfs2_metadata_cache_owner(cache);
+	int status;
+
+	trace_ocfs2_replace_clusters((unsigned long long)owner,
+				     cpos, old, new, len, ext_flags);
+
+	/* An unwritten extent is not duplicated; only the record changes. */
+	if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+		status = context->cow_duplicate_clusters(handle,
+							 context->inode,
+							 cpos, old, new, len);
+		if (status) {
+			mlog_errno(status);
+			return status;
+		}
+	}
+
+	status = ocfs2_clear_ext_refcount(handle, &context->data_et,
+					  cpos, new, len, ext_flags,
+					  context->meta_ac, &context->dealloc);
+	if (status)
+		mlog_errno(status);
+
+	return status;
+}
+
+/*
+ * Start writeback on the byte range backing @num_clusters clusters from
+ * logical cluster @cpos of @inode, then wait for every page in that range
+ * to finish writeback.
+ *
+ * Nothing is done (and 0 is returned) when ocfs2_should_order_data() is
+ * true for the inode.
+ *
+ * Returns 0 on success, the error of filemap_fdatawrite_range(), -ENOMEM
+ * if a page cannot be obtained, or -EIO if a page is flagged PageError
+ * once its writeback has completed.
+ */
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+			     struct inode *inode,
+			     u32 cpos, u32 num_clusters)
+{
+	int ret = 0;
+	loff_t offset, end, map_end;
+	pgoff_t page_index;
+	struct page *page;
+
+	if (ocfs2_should_order_data(inode))
+		return 0;
+
+	/*
+	 * Widen both operands before shifting: a 32-bit cluster count
+	 * shifted by the cluster-size bits can exceed 32 bits.
+	 */
+	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+	end = offset +
+	      (((loff_t)num_clusters) << OCFS2_SB(sb)->s_clustersize_bits);
+
+	ret = filemap_fdatawrite_range(inode->i_mapping,
+				       offset, end - 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	while (offset < end) {
+		page_index = offset >> PAGE_CACHE_SHIFT;
+		map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+		if (map_end > end)
+			map_end = end;
+
+		page = find_or_create_page(inode->i_mapping,
+					   page_index, GFP_NOFS);
+		if (!page) {
+			/*
+			 * Allocation failure is reported to the caller
+			 * instead of crashing the machine.
+			 */
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			break;
+		}
+
+		wait_on_page_writeback(page);
+		if (PageError(page)) {
+			ret = -EIO;
+			mlog_errno(ret);
+		} else
+			mark_page_accessed(page);
+
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+		offset = map_end;
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * get_clusters callback used when CoW'ing an inode's own data: forwards the
+ * lookup to ocfs2_get_clusters() on context->inode.
+ *
+ * ocfs2_make_clusters_writable() also compares context->get_clusters against
+ * this function to decide whether to call ocfs2_cow_sync_writeback().
+ */
+static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
+ u32 v_cluster, u32 *p_cluster,
+ u32 *num_clusters,
+ unsigned int *extent_flags)
+{
+ return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
+ num_clusters, extent_flags);
+}
+
+/*
+ * Make the refcounted run of @num_clusters clusters starting at physical
+ * cluster @p_cluster (mapped at logical cluster @cpos) safe to modify.
+ *
+ * The run is processed one refcount record at a time inside a single
+ * transaction:
+ *   - refcount == 1: only the refcounted flag is cleared from the extent;
+ *   - refcount  > 1: new clusters are claimed and the extent is switched
+ *     to them by ocfs2_replace_clusters().
+ * In both cases the refcount of the old clusters is then decreased.
+ *
+ * After the loop the optional context->post_refcount callback is run and,
+ * when the context operates on inode data (get_clusters ==
+ * ocfs2_di_get_clusters), the range that was just processed is handed to
+ * ocfs2_cow_sync_writeback().
+ *
+ * The allocators stored in context->data_ac / context->meta_ac are always
+ * released before returning.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int ocfs2_make_clusters_writable(struct super_block *sb,
+					struct ocfs2_cow_context *context,
+					u32 cpos, u32 p_cluster,
+					u32 num_clusters, unsigned int e_flags)
+{
+	int ret, delete, index, credits = 0;
+	u32 new_bit, new_len, orig_cpos, orig_num_clusters;
+	unsigned int set_len;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	handle_t *handle;
+	struct buffer_head *ref_leaf_bh = NULL;
+	struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
+	struct ocfs2_refcount_rec rec;
+
+	trace_ocfs2_make_clusters_writable(cpos, p_cluster,
+					   num_clusters, e_flags);
+
+	ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
+					     &context->data_et,
+					     ref_ci,
+					     context->ref_root_bh,
+					     &context->meta_ac,
+					     &context->data_ac, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (context->post_refcount)
+		credits += context->post_refcount->credits;
+
+	credits += context->extra_credits;
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * cpos and num_clusters are consumed by the loop below; remember
+	 * where the range started so the writeback at the end covers the
+	 * clusters that were actually processed.
+	 */
+	orig_cpos = cpos;
+	orig_num_clusters = num_clusters;
+
+	while (num_clusters) {
+		ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
+					     p_cluster, num_clusters,
+					     &rec, &index, &ref_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		BUG_ON(!rec.r_refcount);
+		/* Clamp this step to the end of the refcount record. */
+		set_len = min((u64)p_cluster + num_clusters,
+			      le64_to_cpu(rec.r_cpos) +
+			      le32_to_cpu(rec.r_clusters)) - p_cluster;
+
+		/*
+		 * There are many different situation here.
+		 * 1. If refcount == 1, remove the flag and don't COW.
+		 * 2. If refcount > 1, allocate clusters.
+		 *    Here we may not allocate r_len once at a time, so continue
+		 *    until we reach num_clusters.
+		 */
+		if (le32_to_cpu(rec.r_refcount) == 1) {
+			delete = 0;
+			ret = ocfs2_clear_ext_refcount(handle,
+						       &context->data_et,
+						       cpos, p_cluster,
+						       set_len, e_flags,
+						       context->meta_ac,
+						       &context->dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+		} else {
+			delete = 1;
+
+			ret = __ocfs2_claim_clusters(handle,
+						     context->data_ac,
+						     1, set_len,
+						     &new_bit, &new_len);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+
+			ret = ocfs2_replace_clusters(handle, context,
+						     cpos, p_cluster, new_bit,
+						     new_len, e_flags);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+			/* Only new_len clusters were replaced in this step. */
+			set_len = new_len;
+		}
+
+		ret = __ocfs2_decrease_refcount(handle, ref_ci,
+						context->ref_root_bh,
+						p_cluster, set_len,
+						context->meta_ac,
+						&context->dealloc, delete);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		cpos += set_len;
+		p_cluster += set_len;
+		num_clusters -= set_len;
+		brelse(ref_leaf_bh);
+		ref_leaf_bh = NULL;
+	}
+
+	/* handle any post_cow action. */
+	if (context->post_refcount && context->post_refcount->func) {
+		ret = context->post_refcount->func(context->inode, handle,
+						context->post_refcount->para);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	/*
+	 * Here we should write the new page out first if we are
+	 * in write-back mode.
+	 */
+	if (context->get_clusters == ocfs2_di_get_clusters) {
+		ret = ocfs2_cow_sync_writeback(sb, context->inode, orig_cpos,
+					       orig_num_clusters);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (context->data_ac) {
+		ocfs2_free_alloc_context(context->data_ac);
+		context->data_ac = NULL;
+	}
+	if (context->meta_ac) {
+		ocfs2_free_alloc_context(context->meta_ac);
+		context->meta_ac = NULL;
+	}
+	brelse(ref_leaf_bh);
+
+	return ret;
+}
+
+/*
+ * Drive the CoW of context->cow_len clusters starting at
+ * context->cow_start.
+ *
+ * The range is walked one mapped run at a time, as reported by
+ * context->get_clusters(); each run must be refcounted and is passed to
+ * ocfs2_make_clusters_writable().  Clusters queued in context->dealloc
+ * are released at the end whether or not an error occurred.
+ *
+ * Returns 0 on success, -EROFS if the refcount feature is not enabled on
+ * the super block, or the first error from the callbacks.
+ */
+static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
+{
+	struct inode *inode = context->inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 next_cpos = context->cow_start;
+	u32 remaining = context->cow_len;
+	u32 phys, run;
+	unsigned int flags;
+	int status = 0;
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+			    "tree, but the feature bit is not set in the "
+			    "super block.", inode->i_ino);
+		return -EROFS;
+	}
+
+	ocfs2_init_dealloc_ctxt(&context->dealloc);
+
+	for (; remaining; remaining -= run, next_cpos += run) {
+		status = context->get_clusters(context, next_cpos, &phys,
+					       &run, &flags);
+		if (status) {
+			mlog_errno(status);
+			break;
+		}
+
+		BUG_ON(!(flags & OCFS2_EXT_REFCOUNTED));
+
+		/* Never process more than what is left of the request. */
+		if (run > remaining)
+			run = remaining;
+
+		status = ocfs2_make_clusters_writable(inode->i_sb, context,
+						      next_cpos, phys,
+						      run, flags);
+		if (status) {
+			mlog_errno(status);
+			break;
+		}
+	}
+
+	if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &context->dealloc);
+	}
+
+	return status;
+}
+
+/*
+ * Starting at cpos, try to CoW write_len clusters. Don't CoW
+ * past max_cpos. This will stop when it runs into a hole or an
+ * unrefcounted extent.
+ *
+ * The inode must already have OCFS2_HAS_REFCOUNT_FL set. The actual range
+ * to CoW is computed by ocfs2_refcount_cal_cow_clusters(); the refcount tree
+ * named by di->i_refcount_loc is locked for the duration of the operation.
+ *
+ * Returns 0 on success, -ENOMEM if the CoW context cannot be allocated, or
+ * the first error reported by the helpers.
+ */
+static int ocfs2_refcount_cow_hunk(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 cpos, u32 write_len, u32 max_cpos)
+{
+ int ret;
+ u32 cow_start = 0, cow_len = 0;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct ocfs2_cow_context *context = NULL;
+
+ BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
+ cpos, write_len, max_cpos,
+ &cow_start, &cow_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
+ cpos, write_len, max_cpos,
+ cow_start, cow_len);
+
+ BUG_ON(cow_len == 0);
+
+ context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
+ if (!context) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Inode data is duplicated via the page-based helper. */
+ context->inode = inode;
+ context->cow_start = cow_start;
+ context->cow_len = cow_len;
+ context->ref_tree = ref_tree;
+ context->ref_root_bh = ref_root_bh;
+ context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
+ context->get_clusters = ocfs2_di_get_clusters;
+
+ ocfs2_init_dinode_extent_tree(&context->data_et,
+ INODE_CACHE(inode), di_bh);
+
+ ret = ocfs2_replace_cow(context);
+ if (ret)
+ mlog_errno(ret);
+
+ /*
+ * truncate the extent map here since no matter whether we meet with
+ * any error during the action, we shouldn't trust cached extent map
+ * any more.
+ */
+ ocfs2_extent_map_trunc(inode, cow_start);
+
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+out:
+ /* context is NULL on the earliest failure path; kfree() is still called. */
+ kfree(context);
+ return ret;
+}
+
+/*
+ * CoW every refcounted cluster in [cpos, cpos + write_len), never going
+ * past max_cpos.  Once this returns 0, all clusters in that range may be
+ * modified safely.
+ *
+ * The range is walked one mapped run at a time; runs without
+ * OCFS2_EXT_REFCOUNTED are skipped, the others are handed to
+ * ocfs2_refcount_cow_hunk().
+ */
+int ocfs2_refcount_cow(struct inode *inode,
+		       struct buffer_head *di_bh,
+		       u32 cpos, u32 write_len, u32 max_cpos)
+{
+	unsigned int flags;
+	u32 phys, run;
+	int status = 0;
+
+	for (; write_len; write_len -= run, cpos += run) {
+		status = ocfs2_get_clusters(inode, cpos, &phys,
+					    &run, &flags);
+		if (status) {
+			mlog_errno(status);
+			break;
+		}
+
+		/* Do not step beyond the end of the requested range. */
+		if (run > write_len)
+			run = write_len;
+
+		if (!(flags & OCFS2_EXT_REFCOUNTED))
+			continue;
+
+		status = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
+						 run, max_cpos);
+		if (status) {
+			mlog_errno(status);
+			break;
+		}
+	}
+
+	return status;
+}
+
+/*
+ * get_clusters callback used when CoW'ing an xattr value: context->cow_object
+ * is taken to be the xattr value root, and the lookup is forwarded to
+ * ocfs2_xattr_get_clusters() on its extent list.
+ */
+static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
+ u32 v_cluster, u32 *p_cluster,
+ u32 *num_clusters,
+ unsigned int *extent_flags)
+{
+ struct inode *inode = context->inode;
+ struct ocfs2_xattr_value_root *xv = context->cow_object;
+
+ return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
+ num_clusters, &xv->xr_list,
+ extent_flags);
+}
+
+/*
+ * Given a xattr value root, calculate the most meta/credits we need for
+ * refcount tree change if we truncate it to 0.
+ *
+ * Every extent of @xv is walked and, for each refcount record it touches,
+ * one credit is added to *credits.  Whenever the leaf holding a record
+ * could not absorb the worst-case number of new records, one more
+ * metadata block is counted; the total is added to *meta_add.  If any
+ * extra block is needed, the credits for growing the refcount tree are
+ * added as well.
+ *
+ * *meta_add and *credits are accumulated into, not reset.
+ *
+ * Returns 0 on success or the error of ocfs2_xattr_get_clusters() /
+ * ocfs2_get_refcount_rec().
+ */
+int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
+				       struct ocfs2_caching_info *ref_ci,
+				       struct buffer_head *ref_root_bh,
+				       struct ocfs2_xattr_value_root *xv,
+				       int *meta_add, int *credits)
+{
+	int ret = 0, index, ref_blocks = 0;
+	u32 p_cluster, num_clusters, rec_len;
+	u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_rec rec;
+	struct buffer_head *ref_leaf_bh = NULL;
+
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, &xv->xr_list,
+					       NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		cpos += num_clusters;
+
+		while (num_clusters) {
+			ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
+						     p_cluster, num_clusters,
+						     &rec, &index,
+						     &ref_leaf_bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			BUG_ON(!rec.r_refcount);
+
+			rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+
+			/*
+			 * We really don't know whether the other clusters is in
+			 * this refcount block or not, so just take the worst
+			 * case that all the clusters are in this block and each
+			 * one will split a refcount rec, so totally we need
+			 * clusters * 2 new refcount rec.
+			 */
+			if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
+			    le16_to_cpu(rb->rf_records.rl_count))
+				ref_blocks++;
+
+			*credits += 1;
+			brelse(ref_leaf_bh);
+			ref_leaf_bh = NULL;
+
+			rec_len = le32_to_cpu(rec.r_clusters);
+			if (num_clusters <= rec_len)
+				break;
+
+			/*
+			 * Step past the record just accounted for: the
+			 * physical cursor must advance by the number of
+			 * clusters consumed, not by the number remaining.
+			 */
+			num_clusters -= rec_len;
+			p_cluster += rec_len;
+		}
+	}
+
+	*meta_add += ref_blocks;
+	if (!ref_blocks)
+		goto out;
+
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+		*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+	else {
+		struct ocfs2_extent_tree et;
+
+		ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
+		*credits += ocfs2_calc_extend_credits(inode->i_sb,
+						      et.et_root_el);
+	}
+
+out:
+	brelse(ref_leaf_bh);
+	return ret;
+}
+
+/*
+ * Do CoW for xattr.
+ *
+ * CoWs write_len clusters starting at cpos of the xattr value described by
+ * vb. Unlike the inode-data path, the caller passes in the refcount tree and
+ * its root buffer (this function does not lock the tree itself), clusters
+ * are duplicated through the journal (ocfs2_duplicate_clusters_by_jbd), and
+ * an optional post-refcount callback is carried in the context.
+ *
+ * NOTE(review): the di parameter is not referenced in this function.
+ *
+ * Returns 0 on success, -ENOMEM if the CoW context cannot be allocated, or
+ * the first error reported by the helpers.
+ */
+int ocfs2_refcount_cow_xattr(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_refcount_tree *ref_tree,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 write_len,
+ struct ocfs2_post_refcount *post)
+{
+ int ret;
+ struct ocfs2_xattr_value_root *xv = vb->vb_xv;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_cow_context *context = NULL;
+ u32 cow_start, cow_len;
+
+ BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+ /* UINT_MAX as max_cpos: no upper bound is imposed on the CoW range. */
+ ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
+ cpos, write_len, UINT_MAX,
+ &cow_start, &cow_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(cow_len == 0);
+
+ context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
+ if (!context) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ context->inode = inode;
+ context->cow_start = cow_start;
+ context->cow_len = cow_len;
+ context->ref_tree = ref_tree;
+ context->ref_root_bh = ref_root_bh;
+ context->cow_object = xv;
+
+ context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
+ /* We need the extra credits for duplicate_clusters by jbd. */
+ context->extra_credits =
+ ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
+ context->get_clusters = ocfs2_xattr_value_get_clusters;
+ context->post_refcount = post;
+
+ ocfs2_init_xattr_value_extent_tree(&context->data_et,
+ INODE_CACHE(inode), vb);
+
+ ret = ocfs2_replace_cow(context);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ kfree(context);
+ return ret;
+}
+
+/*
+ * Insert a new extent into the refcount tree and mark an extent rec
+ * as refcounted in the dinode tree.
+ *
+ * Both steps, plus the optional post callback, run in one transaction whose
+ * credits come from ocfs2_calc_refcount_meta_credits() and post->credits.
+ * Metadata blocks are reserved first when that calculation asks for any.
+ *
+ * Returns 0 on success or the first error reported by the helpers.
+ */
+int ocfs2_add_refcount_flag(struct inode *inode,
+ struct ocfs2_extent_tree *data_et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 p_cluster, u32 num_clusters,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_post_refcount *post)
+{
+ int ret;
+ handle_t *handle;
+ int credits = 1, ref_blocks = 0;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_alloc_context *meta_ac = NULL;
+
+ ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
+ ref_ci, ref_root_bh,
+ p_cluster, num_clusters,
+ &ref_blocks, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_add_refcount_flag(ref_blocks, credits);
+
+ if (ref_blocks) {
+ ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+ ref_blocks, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (post)
+ credits += post->credits;
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
+ cpos, num_clusters, p_cluster,
+ meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+ p_cluster, num_clusters, 0,
+ meta_ac, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /* Let the caller piggy-back extra work on this transaction. */
+ if (post && post->func) {
+ ret = post->func(inode, handle, post->para);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+/*
+ * Set the inode's ctime to CURRENT_TIME and journal the matching update
+ * of the on-disk dinode held in @di_bh, all in its own transaction.
+ *
+ * Returns 0 on success or the error of ocfs2_start_trans() /
+ * ocfs2_journal_access_di().
+ */
+static int ocfs2_change_ctime(struct inode *inode,
+			      struct buffer_head *di_bh)
+{
+	struct ocfs2_dinode *dinode = (struct ocfs2_dinode *)di_bh->b_data;
+	handle_t *trans;
+	int status;
+
+	trans = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+				  OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(trans)) {
+		status = PTR_ERR(trans);
+		mlog_errno(status);
+		return status;
+	}
+
+	status = ocfs2_journal_access_di(trans, INODE_CACHE(inode), di_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status) {
+		mlog_errno(status);
+	} else {
+		/* Keep the in-memory and on-disk timestamps in step. */
+		inode->i_ctime = CURRENT_TIME;
+		dinode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+		dinode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+		ocfs2_journal_dirty(trans, di_bh);
+	}
+
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), trans);
+	return status;
+}
+
+/*
+ * Make sure the inode has a refcount tree and that all of its allocated
+ * extents (and xattrs, if any) are marked refcounted.
+ *
+ * A refcount tree is created first when the inode lacks
+ * OCFS2_HAS_REFCOUNT_FL. Every mapped, not-yet-refcounted extent within
+ * i_size is then passed to ocfs2_add_refcount_flag(); inline-data inodes
+ * skip that walk. ctime is updated only if at least one extent was changed.
+ *
+ * The cached extent map is emptied on every exit path.
+ *
+ * Returns 0 on success or the first error reported by the helpers.
+ */
+static int ocfs2_attach_refcount_tree(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+ int ret, data_changed = 0;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_refcount_tree *ref_tree;
+ unsigned int ext_flags;
+ loff_t size;
+ u32 cpos, num_clusters, clusters, p_cluster;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_extent_tree di_et;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
+ ret = ocfs2_create_refcount_tree(inode, di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ BUG_ON(!di->i_refcount_loc);
+ ret = ocfs2_lock_refcount_tree(osb,
+ le64_to_cpu(di->i_refcount_loc), 1,
+ &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Inline-data inodes skip the extent walk below. */
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ goto attach_xattr;
+
+ ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
+
+ size = i_size_read(inode);
+ clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
+
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ /* Only runs with a physical cluster that are not yet refcounted. */
+ if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
+ ret = ocfs2_add_refcount_flag(inode, &di_et,
+ &ref_tree->rf_ci,
+ ref_root_bh, cpos,
+ p_cluster, num_clusters,
+ &dealloc, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ data_changed = 1;
+ }
+ cpos += num_clusters;
+ }
+
+attach_xattr:
+ if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+ ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
+ &ref_tree->rf_ci,
+ ref_root_bh,
+ &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ }
+
+ if (data_changed) {
+ ret = ocfs2_change_ctime(inode, di_bh);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+unlock:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+
+ /* Deferred deallocations are only run when everything succeeded. */
+ if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+ }
+out:
+ /*
+ * Empty the extent map so that we may get the right extent
+ * record from the disk.
+ */
+ ocfs2_extent_map_trunc(inode, 0);
+
+ return ret;
+}
+
+/*
+ * Insert the extent (cpos -> p_cluster, num_clusters long, with ext_flags)
+ * into et and increase the refcount of those physical clusters, both inside
+ * one transaction.
+ *
+ * Credits and the metadata allocator come from
+ * ocfs2_lock_refcount_allocators(); no data allocator is requested (NULL is
+ * passed for it).
+ *
+ * Returns 0 on success or the first error reported by the helpers.
+ */
+static int ocfs2_add_refcounted_extent(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 p_cluster, u32 num_clusters,
+ unsigned int ext_flags,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+ handle_t *handle;
+ int credits = 0;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_alloc_context *meta_ac = NULL;
+
+ ret = ocfs2_lock_refcount_allocators(inode->i_sb,
+ p_cluster, num_clusters,
+ et, ref_ci,
+ ref_root_bh, &meta_ac,
+ NULL, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_insert_extent(handle, et, cpos,
+ ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
+ num_clusters, ext_flags, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+ p_cluster, num_clusters,
+ meta_ac, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+/*
+ * Copy the inline data of s_inode's dinode (s_bh) into t_inode's dinode
+ * (t_bh) and flag the target as inline-data, in its own transaction.
+ *
+ * The source must have OCFS2_INLINE_DATA_FL set. The number of bytes copied
+ * is the source's id_count.
+ *
+ * Returns 0 on success or the error of ocfs2_start_trans() /
+ * ocfs2_journal_access_di().
+ */
+static int ocfs2_duplicate_inline_data(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh)
+{
+ int ret;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
+ struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
+ struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
+
+ BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
+ memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
+ le16_to_cpu(s_di->id2.i_data.id_count));
+ /* ip_dyn_features is updated under ip_lock and mirrored to disk. */
+ spin_lock(&OCFS2_I(t_inode)->ip_lock);
+ OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
+ t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
+ spin_unlock(&OCFS2_I(t_inode)->ip_lock);
+
+ ocfs2_journal_dirty(handle, t_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ return ret;
+}
+
+/*
+ * Give @t_inode the same extent layout as @s_inode.
+ *
+ * Every mapped run of the source within its i_size is inserted into the
+ * target's dinode extent tree (rooted at @t_bh) by
+ * ocfs2_add_refcounted_extent(), which also raises the refcount of the
+ * shared clusters.
+ *
+ * Returns 0 on success or the first error reported by the helpers.
+ */
+static int ocfs2_duplicate_extent_list(struct inode *s_inode,
+				struct inode *t_inode,
+				struct buffer_head *t_bh,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	struct ocfs2_extent_tree target_et;
+	unsigned int flags;
+	u32 phys, run, total, v_cpos;
+	loff_t bytes;
+	int status = 0;
+
+	ocfs2_init_dinode_extent_tree(&target_et, INODE_CACHE(t_inode), t_bh);
+
+	bytes = i_size_read(s_inode);
+	total = ocfs2_clusters_for_bytes(s_inode->i_sb, bytes);
+
+	for (v_cpos = 0; v_cpos < total; v_cpos += run) {
+		status = ocfs2_get_clusters(s_inode, v_cpos, &phys,
+					    &run, &flags);
+		if (status) {
+			mlog_errno(status);
+			break;
+		}
+
+		/* Skip runs for which no physical cluster was reported. */
+		if (!phys)
+			continue;
+
+		status = ocfs2_add_refcounted_extent(t_inode, &target_et,
+						     ref_ci, ref_root_bh,
+						     v_cpos, phys,
+						     run, flags,
+						     dealloc);
+		if (status) {
+			mlog_errno(status);
+			break;
+		}
+	}
+
+	return status;
+}
+
+/*
+ * change the new file's attributes to the src.
+ *
+ * reflink creates a snapshot of a file, that means the attributes
+ * must be identical except for three exceptions - nlink, ino, and ctime.
+ *
+ * Size, cluster count, attr and dyn_features are always copied from the
+ * source, both in memory and in the on-disk dinode. Ownership, mode and
+ * mtime are copied (and ctime set to the current time) only when preserve is
+ * true.
+ *
+ * Returns 0 on success or the error of ocfs2_start_trans() /
+ * ocfs2_journal_access_di().
+ */
+static int ocfs2_complete_reflink(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ bool preserve)
+{
+ int ret;
+ handle_t *handle;
+ struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
+ loff_t size = i_size_read(s_inode);
+
+ handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /* In-memory state of the target, updated under its ip_lock. */
+ spin_lock(&OCFS2_I(t_inode)->ip_lock);
+ OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
+ OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
+ OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
+ spin_unlock(&OCFS2_I(t_inode)->ip_lock);
+ i_size_write(t_inode, size);
+ t_inode->i_blocks = s_inode->i_blocks;
+
+ /* On-disk fields are copied verbatim (already little-endian). */
+ di->i_xattr_inline_size = s_di->i_xattr_inline_size;
+ di->i_clusters = s_di->i_clusters;
+ di->i_size = s_di->i_size;
+ di->i_dyn_features = s_di->i_dyn_features;
+ di->i_attr = s_di->i_attr;
+
+ if (preserve) {
+ t_inode->i_uid = s_inode->i_uid;
+ t_inode->i_gid = s_inode->i_gid;
+ t_inode->i_mode = s_inode->i_mode;
+ di->i_uid = s_di->i_uid;
+ di->i_gid = s_di->i_gid;
+ di->i_mode = s_di->i_mode;
+
+ /*
+ * update time.
+ * we want mtime to appear identical to the source and
+ * update ctime.
+ */
+ t_inode->i_ctime = CURRENT_TIME;
+
+ di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
+
+ t_inode->i_mtime = s_inode->i_mtime;
+ di->i_mtime = s_di->i_mtime;
+ di->i_mtime_nsec = s_di->i_mtime_nsec;
+ }
+
+ ocfs2_journal_dirty(handle, t_bh);
+
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
+ return ret;
+}
+
+/*
+ * Populate the reflink target @t_inode from @s_inode.
+ *
+ * The target is first attached to the source's refcount tree.  An
+ * inline-data source then has its data copied into the target dinode;
+ * otherwise the refcount tree is locked and the source's extent list is
+ * duplicated into the target.  Clusters queued for deallocation along the
+ * way are released before returning, on success and on failure alike.
+ *
+ * @preserve is accepted for symmetry with the other reflink helpers and
+ * is not used here.
+ *
+ * Returns 0 on success or the first error reported by the helpers.
+ */
+static int ocfs2_create_reflink_node(struct inode *s_inode,
+				     struct buffer_head *s_bh,
+				     struct inode *t_inode,
+				     struct buffer_head *t_bh,
+				     bool preserve)
+{
+	int ret;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
+	struct ocfs2_refcount_tree *ref_tree;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+				      le64_to_cpu(di->i_refcount_loc));
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Inline data lives in the dinode itself: no extents to share. */
+	if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
+						  t_inode, t_bh);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+				       1, &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
+					  &ref_tree->rf_ci, ref_root_bh,
+					  &dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
+out:
+	if (ocfs2_dealloc_has_cluster(&dealloc)) {
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &dealloc);
+	}
+
+	return ret;
+}
+
+/*
+ * Core of the reflink operation: turn new_inode into a reflink of the inode
+ * behind old_dentry.
+ *
+ * Steps, in order: reject system files, start writeback of the source's
+ * dirty pages, attach a refcount tree to the source, lock the target
+ * (i_mutex + cluster inode lock), duplicate data and xattrs into it, and
+ * finally copy the attributes. On success the function also waits for the
+ * writeback started at the beginning.
+ *
+ * Returns 0 on success, -EINVAL for a system file, or the first error
+ * reported by the helpers.
+ */
+static int __ocfs2_reflink(struct dentry *old_dentry,
+ struct buffer_head *old_bh,
+ struct inode *new_inode,
+ bool preserve)
+{
+ int ret;
+ struct inode *inode = old_dentry->d_inode;
+ struct buffer_head *new_bh = NULL;
+
+ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Kick off writeback now; it is waited for at the out: label. */
+ ret = filemap_fdatawrite(inode->i_mapping);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_attach_refcount_tree(inode, old_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
+ ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
+ OI_LS_REFLINK_TARGET);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_create_reflink_node(inode, old_bh,
+ new_inode, new_bh, preserve);
+ if (ret) {
+ mlog_errno(ret);
+ goto inode_unlock;
+ }
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+ ret = ocfs2_reflink_xattrs(inode, old_bh,
+ new_inode, new_bh,
+ preserve);
+ if (ret) {
+ mlog_errno(ret);
+ goto inode_unlock;
+ }
+ }
+
+ ret = ocfs2_complete_reflink(inode, old_bh,
+ new_inode, new_bh, preserve);
+ if (ret)
+ mlog_errno(ret);
+
+inode_unlock:
+ ocfs2_inode_unlock(new_inode, 1);
+ brelse(new_bh);
+out_unlock:
+ mutex_unlock(&new_inode->i_mutex);
+out:
+ if (!ret) {
+ ret = filemap_fdatawait(inode->i_mapping);
+ if (ret)
+ mlog_errno(ret);
+ }
+ return ret;
+}
+
+/*
+ * Create new_dentry in dir as a reflink of the inode behind old_dentry.
+ *
+ * The new inode is first created in the orphan directory, filled in by
+ * __ocfs2_reflink() while the source's rw lock, inode lock, ip_xattr_sem and
+ * ip_alloc_sem are held, and only moved to its final name once everything
+ * succeeded. When preserve is false, security attributes and ACLs are
+ * re-initialised from dir instead of being inherited from the source.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the refcount feature is not enabled,
+ * or the first error reported by the helpers.
+ *
+ * NOTE(review): default_acl and acl are only assigned by posix_acl_create(),
+ * yet the out: path reads them even when that call fails -- confirm that it
+ * initialises both outputs on every return path.
+ */
+static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry, bool preserve)
+{
+ int error;
+ struct inode *inode = old_dentry->d_inode;
+ struct buffer_head *old_bh = NULL;
+ struct inode *new_orphan_inode = NULL;
+ struct posix_acl *default_acl, *acl;
+ umode_t mode;
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ mode = inode->i_mode;
+ error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ error = ocfs2_create_inode_in_orphan(dir, mode,
+ &new_orphan_inode);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ error = ocfs2_rw_lock(inode, 1);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ error = ocfs2_inode_lock(inode, &old_bh, 1);
+ if (error) {
+ mlog_errno(error);
+ ocfs2_rw_unlock(inode, 1);
+ goto out;
+ }
+
+ /* Semaphores are released in the reverse order of acquisition. */
+ down_write(&OCFS2_I(inode)->ip_xattr_sem);
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ error = __ocfs2_reflink(old_dentry, old_bh,
+ new_orphan_inode, preserve);
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ up_write(&OCFS2_I(inode)->ip_xattr_sem);
+
+ ocfs2_inode_unlock(inode, 1);
+ ocfs2_rw_unlock(inode, 1);
+ brelse(old_bh);
+
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ /* If the security isn't preserved, we need to re-initialize them. */
+ if (!preserve) {
+ error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
+ &new_dentry->d_name,
+ default_acl, acl);
+ if (error)
+ mlog_errno(error);
+ }
+out:
+ if (default_acl)
+ posix_acl_release(default_acl);
+ if (acl)
+ posix_acl_release(acl);
+ /* Only a fully successful reflink leaves the orphan directory. */
+ if (!error) {
+ error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+ new_dentry);
+ if (error)
+ mlog_errno(error);
+ }
+
+ if (new_orphan_inode) {
+ /*
+ * We need to open_unlock the inode no matter whether we
+ * succeed or not, so that other nodes can delete it later.
+ */
+ ocfs2_open_unlock(new_orphan_inode);
+ if (error)
+ iput(new_orphan_inode);
+ }
+
+ return error;
+}
+
+/*
+ * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
+ * sys_reflink(). This will go away when vfs_reflink() exists in
+ * fs/namei.c.
+ */
+
+/*
+ * Local copy of the VFS may_create() check: refuse when @child already
+ * has an inode (-EEXIST) or @dir is dead (-ENOENT); otherwise the result
+ * is that of the write+exec permission check on @dir.
+ */
+static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
+{
+	int err;
+
+	if (child->d_inode)
+		err = -EEXIST;
+	else if (IS_DEADDIR(dir))
+		err = -ENOENT;
+	else
+		err = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+
+	return err;
+}
+
+/**
+ * ocfs2_vfs_reflink - Create a reference-counted link
+ *
+ * @old_dentry: source dentry + inode
+ * @dir: directory to create the target
+ * @new_dentry: target dentry
+ * @preserve: if true, preserve all file attributes
+ *
+ * Performs the permission and sanity checks, then calls ocfs2_reflink() with
+ * the source inode's i_mutex held. A create notification is sent for
+ * @new_dentry on success.
+ *
+ * Returns 0 on success; -ENOENT if the source has no inode, -EXDEV if source
+ * and @dir are on different super blocks, -EPERM for append-only, immutable
+ * or non-regular sources or when @preserve is requested without the rights
+ * to keep ownership; otherwise the error of the failing check or of
+ * ocfs2_reflink().
+ */
+static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry, bool preserve)
+{
+ struct inode *inode = old_dentry->d_inode;
+ int error;
+
+ if (!inode)
+ return -ENOENT;
+
+ error = ocfs2_may_create(dir, new_dentry);
+ if (error)
+ return error;
+
+ if (dir->i_sb != inode->i_sb)
+ return -EXDEV;
+
+ /*
+ * A reflink to an append-only or immutable file cannot be created.
+ */
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return -EPERM;
+
+ /* Only regular files can be reflinked. */
+ if (!S_ISREG(inode->i_mode))
+ return -EPERM;
+
+ /*
+ * If the caller wants to preserve ownership, they require the
+ * rights to do so.
+ */
+ if (preserve) {
+ if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN))
+ return -EPERM;
+ if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
+ return -EPERM;
+ }
+
+ /*
+ * If the caller is modifying any aspect of the attributes, they
+ * are not creating a snapshot. They need read permission on the
+ * file.
+ */
+ if (!preserve) {
+ error = inode_permission(inode, MAY_READ);
+ if (error)
+ return error;
+ }
+
+ mutex_lock(&inode->i_mutex);
+ dquot_initialize(dir);
+ error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
+ mutex_unlock(&inode->i_mutex);
+ if (!error)
+ fsnotify_create(dir, new_dentry);
+ return error;
+}
+/*
+ * Entry point for the reflink ioctl: resolve the user-supplied oldname and
+ * newname and create newname as a reflink of oldname.
+ *
+ * Most of this code is copied from sys_linkat.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the refcount feature is not enabled
+ * on inode's super block, -EXDEV if the two paths are on different mounts,
+ * or the error of the path lookups / ocfs2_vfs_reflink().
+ */
+int ocfs2_reflink_ioctl(struct inode *inode,
+ const char __user *oldname,
+ const char __user *newname,
+ bool preserve)
+{
+ struct dentry *new_dentry;
+ struct path old_path, new_path;
+ int error;
+
+ if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
+ if (error) {
+ mlog_errno(error);
+ return error;
+ }
+
+ new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
+ /* error is only meaningful if new_dentry is an ERR_PTR, checked next. */
+ error = PTR_ERR(new_dentry);
+ if (IS_ERR(new_dentry)) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ error = -EXDEV;
+ if (old_path.mnt != new_path.mnt) {
+ mlog_errno(error);
+ goto out_dput;
+ }
+
+ error = ocfs2_vfs_reflink(old_path.dentry,
+ new_path.dentry->d_inode,
+ new_dentry, preserve);
+out_dput:
+ done_path_create(&new_path, new_dentry);
+out:
+ path_put(&old_path);
+
+ return error;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
new file mode 100644
index 00000000000..6422bbcdb52
--- /dev/null
+++ b/fs/ocfs2/refcounttree.h
@@ -0,0 +1,118 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * refcounttree.h
+ *
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_REFCOUNTTREE_H
+#define OCFS2_REFCOUNTTREE_H
+
+/*
+ * In-memory representation of a refcount tree.
+ *
+ * NOTE(review): the per-field notes below are inferred from the field
+ * types and names only; confirm against refcounttree.c.
+ */
+struct ocfs2_refcount_tree {
+ struct rb_node rf_node; /* rbtree linkage */
+ u64 rf_blkno; /* presumably the block number of the tree root */
+ u32 rf_generation;
+ struct kref rf_getcnt; /* reference count */
+ struct rw_semaphore rf_sem;
+ struct ocfs2_lock_res rf_lockres;
+ int rf_removed;
+
+ /* the following 4 fields are used by caching_info. */
+ spinlock_t rf_lock;
+ struct ocfs2_caching_info rf_ci;
+ struct mutex rf_io_mutex;
+ struct super_block *rf_sb;
+};
+
+void ocfs2_purge_refcount_trees(struct ocfs2_super *osb);
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
+ struct ocfs2_refcount_tree **tree,
+ struct buffer_head **ref_bh);
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+ struct ocfs2_refcount_tree *tree,
+ int rw);
+
+int ocfs2_decrease_refcount(struct inode *inode,
+ handle_t *handle, u32 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int delete);
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+ u64 refcount_loc,
+ u64 phys_blkno,
+ u32 clusters,
+ int *credits,
+ int *ref_blocks);
+int ocfs2_refcount_cow(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 cpos, u32 write_len, u32 max_cpos);
+
+/* Signature of the callback stored in struct ocfs2_post_refcount. */
+typedef int (ocfs2_post_refcount_func)(struct inode *inode,
+ handle_t *handle,
+ void *para);
+/*
+ * Some refcount callers need to do more work after we modify the data b-tree
+ * during a refcount operation (including CoW and adding the refcount flag),
+ * and that work must complete within the same transaction. Such a caller
+ * passes us this structure so that we can run its callback within our
+ * transaction.
+ */
+struct ocfs2_post_refcount {
+ int credits; /* journal credits the callback needs. */
+ ocfs2_post_refcount_func *func; /* the callback itself. */
+ void *para; /* opaque argument handed to func. */
+};
+
+int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_xattr_value_root *xv,
+ int *meta_add, int *credits);
+int ocfs2_refcount_cow_xattr(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_refcount_tree *ref_tree,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 write_len,
+ struct ocfs2_post_refcount *post);
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len);
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len);
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+ struct inode *inode,
+ u32 cpos, u32 num_clusters);
+int ocfs2_add_refcount_flag(struct inode *inode,
+ struct ocfs2_extent_tree *data_et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ u32 cpos, u32 p_cluster, u32 num_clusters,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_post_refcount *post);
+int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
+int ocfs2_try_remove_refcount_tree(struct inode *inode,
+ struct buffer_head *di_bh);
+int ocfs2_increase_refcount(handle_t *handle,
+ struct ocfs2_caching_info *ci,
+ struct buffer_head *ref_root_bh,
+ u64 cpos, u32 len,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_reflink_ioctl(struct inode *inode,
+ const char __user *oldname,
+ const char __user *newname,
+ bool preserve);
+#endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644
index 00000000000..41ffd36c689
--- /dev/null
+++ b/fs/ocfs2/reservations.c
@@ -0,0 +1,839 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.c
+ *
+ * Allocation reservations implementation
+ *
+ * Some code borrowed from fs/ext3/balloc.c and is:
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * The rest is copyright (C) 2010 Novell. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_trace.h"
+
+#ifdef CONFIG_OCFS2_DEBUG_FS
+#define OCFS2_CHECK_RESERVATIONS
+#endif
+
+DEFINE_SPINLOCK(resv_lock);
+
+#define OCFS2_MIN_RESV_WINDOW_BITS 8
+#define OCFS2_MAX_RESV_WINDOW_BITS 1024
+
+/*
+ * Directory reservations are allowed only when both the general
+ * reservation level and the directory reservation level are non-zero.
+ * Returns 1 if allowed, 0 otherwise.
+ */
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
+{
+	if (!osb->osb_resv_level)
+		return 0;
+
+	return osb->osb_dir_resv_level != 0;
+}
+
+/*
+ * Size, in bits, of the reservation window to aim for. The size is
+ * 4 << level, so level 1 yields 8 bits and every further level doubles
+ * it. Directory reservations (OCFS2_RESV_FLAG_DIR) use the directory
+ * level, everything else uses the general level.
+ */
+static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
+					   struct ocfs2_alloc_reservation *resv)
+{
+	struct ocfs2_super *osb = resmap->m_osb;
+
+	if (resv->r_flags & OCFS2_RESV_FLAG_DIR)
+		return 4 << osb->osb_dir_resv_level;
+
+	return 4 << osb->osb_resv_level;
+}
+
+/*
+ * Last bit covered by @resv (inclusive). An empty reservation reports
+ * its start as its end.
+ */
+static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
+{
+	return resv->r_len ? resv->r_start + resv->r_len - 1 : resv->r_start;
+}
+
+/* Returns 1 when @resv currently covers no bits, 0 otherwise. */
+static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
+{
+	return resv->r_len == 0;
+}
+
+/* Reservations are disabled when the super block's resv level is zero. */
+static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
+{
+	return resmap->m_osb->osb_resv_level == 0;
+}
+
+/*
+ * Log every reservation in @resmap at ML_NOTICE: first in rbtree order,
+ * then in LRU order.
+ */
+static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
+{
+	struct ocfs2_super *osb = resmap->m_osb;
+	struct ocfs2_alloc_reservation *resv;
+	struct rb_node *node;
+	int count = 0;
+
+	mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
+	     osb->dev_str, resmap->m_bitmap_len);
+
+	for (node = rb_first(&resmap->m_reservations); node;
+	     node = rb_next(node)) {
+		resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+		mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
+		     "\tlast_len: %u\n", resv->r_start,
+		     ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+		     resv->r_last_len);
+
+		count++;
+	}
+
+	mlog(ML_NOTICE, "%d reservations found. LRU follows\n", count);
+
+	/* Restart the counter so LRU entries are numbered from zero. */
+	count = 0;
+	list_for_each_entry(resv, &resmap->m_lru, r_lru) {
+		mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
+		     "last_start: %u\tlast_len: %u\n", count, resv->r_start,
+		     ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+		     resv->r_last_len);
+
+		count++;
+	}
+}
+
+#ifdef OCFS2_CHECK_RESERVATIONS
+/*
+ * Debug check: every bit covered by @resv must be clear in the disk
+ * bitmap. @i is the reservation's index, used only for the log message.
+ * Returns 1 (after logging) on the first set bit found, 0 otherwise.
+ */
+static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
+				      int i,
+				      struct ocfs2_alloc_reservation *resv)
+{
+	char *disk_bitmap = resmap->m_disk_bitmap;
+	unsigned int last = ocfs2_resv_end(resv);
+	unsigned int bit;
+
+	for (bit = resv->r_start; bit <= last; bit++) {
+		if (!ocfs2_test_bit(bit, disk_bitmap))
+			continue;
+
+		mlog(ML_ERROR,
+		     "reservation %d covers an allocated area "
+		     "starting at bit %u!\n", i, bit);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Debug-only consistency check of the whole reservation tree (compiled in
+ * when OCFS2_CHECK_RESERVATIONS is defined). Walks the rbtree in order and
+ * verifies, for each reservation, that it starts after the previous one
+ * ends, has a non-zero length and a sane range, fits inside the bitmap,
+ * and covers only clear bits. On any violation the map is dumped and the
+ * kernel BUG()s.
+ */
+static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+ /* End bit of the previously visited reservation. */
+ unsigned int off = 0;
+ int i = 0;
+ struct rb_node *node;
+ struct ocfs2_alloc_reservation *resv;
+
+ node = rb_first(&resmap->m_reservations);
+ while (node) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ /* Windows must be strictly ordered and must not overlap. */
+ if (i > 0 && resv->r_start <= off) {
+ mlog(ML_ERROR, "reservation %d has bad start off!\n",
+ i);
+ goto bad;
+ }
+
+ /* Empty reservations must never be linked into the tree. */
+ if (resv->r_len == 0) {
+ mlog(ML_ERROR, "reservation %d has no length!\n",
+ i);
+ goto bad;
+ }
+
+ /* Catches unsigned wrap-around of start + len. */
+ if (resv->r_start > ocfs2_resv_end(resv)) {
+ mlog(ML_ERROR, "reservation %d has invalid range!\n",
+ i);
+ goto bad;
+ }
+
+ if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
+ mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
+ i);
+ goto bad;
+ }
+
+ if (ocfs2_validate_resmap_bits(resmap, i, resv))
+ goto bad;
+
+ off = ocfs2_resv_end(resv);
+ node = rb_next(node);
+
+ i++;
+ }
+ return;
+
+bad:
+ ocfs2_dump_resv(resmap);
+ BUG();
+}
+#else
+/* No-op stand-in used when OCFS2_CHECK_RESERVATIONS is not defined. */
+static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+
+}
+#endif
+
+/*
+ * Put @resv into its initial state: all fields zeroed (so the window is
+ * empty and no flags are set) and the LRU list head self-linked.
+ */
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
+{
+ memset(resv, 0, sizeof(*resv));
+ /* Must follow the memset; an initialized list head is not all-zero. */
+ INIT_LIST_HEAD(&resv->r_lru);
+}
+
+/*
+ * OR the given type flags into @resv->r_flags. Only bits contained in
+ * OCFS2_RESV_TYPES may be passed; anything else triggers BUG_ON().
+ * Flags already set are never cleared here.
+ */
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+ unsigned int flags)
+{
+ BUG_ON(flags & ~OCFS2_RESV_TYPES);
+
+ resv->r_flags |= flags;
+}
+
+/*
+ * Initialize @resmap for @osb: zero it, record the owning super block and
+ * set up an empty reservation tree and an empty LRU list. The disk bitmap
+ * pointer and length stay zero until ocfs2_resmap_restart() is called.
+ *
+ * Always returns 0.
+ */
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+ struct ocfs2_reservation_map *resmap)
+{
+ memset(resmap, 0, sizeof(*resmap));
+
+ resmap->m_osb = osb;
+ resmap->m_reservations = RB_ROOT;
+ /* m_bitmap_len is initialized to zero by the above memset. */
+ INIT_LIST_HEAD(&resmap->m_lru);
+
+ return 0;
+}
+
+/*
+ * Move @resv to the tail of @resmap's LRU list, unlinking it first if it
+ * is already on a list. ocfs2_cannibalize_resv() takes its victim from
+ * the head, so the tail holds the most recently used entry.
+ *
+ * Caller must hold resv_lock.
+ */
+static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ assert_spin_locked(&resv_lock);
+
+ if (!list_empty(&resv->r_lru))
+ list_del_init(&resv->r_lru);
+
+ list_add_tail(&resv->r_lru, &resmap->m_lru);
+}
+
+/*
+ * Reset the window of @resv to empty (start and length both zero). Does
+ * not touch the rbtree, the LRU list, the flags or the r_last_* fields.
+ */
+static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
+{
+ resv->r_len = 0;
+ resv->r_start = 0;
+}
+
+/*
+ * Unlink @resv from @resmap's LRU list and rbtree and clear its INUSE
+ * flag. Does nothing when the reservation is not marked INUSE.
+ */
+static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
+			      struct ocfs2_alloc_reservation *resv)
+{
+	if (!(resv->r_flags & OCFS2_RESV_FLAG_INUSE))
+		return;
+
+	list_del_init(&resv->r_lru);
+	rb_erase(&resv->r_node, &resmap->m_reservations);
+	resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
+}
+
+/*
+ * Fully reset @resv: empty its window, forget the last allocation and
+ * unlink it from @resmap's rbtree and LRU list.
+ *
+ * Caller must hold resv_lock.
+ */
+static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ assert_spin_locked(&resv_lock);
+
+ __ocfs2_resv_trunc(resv);
+ /*
+ * last_len and last_start no longer make sense if
+ * we're changing the range of our allocations.
+ */
+ resv->r_last_len = resv->r_last_start = 0;
+
+ ocfs2_resv_remove(resmap, resv);
+}
+
+/*
+ * Locked wrapper around __ocfs2_resv_discard(): takes resv_lock, discards
+ * @resv and drops the lock again. A NULL @resv is silently ignored.
+ */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+			struct ocfs2_alloc_reservation *resv)
+{
+	if (!resv)
+		return;
+
+	spin_lock(&resv_lock);
+	__ocfs2_resv_discard(resmap, resv);
+	spin_unlock(&resv_lock);
+}
+
+/*
+ * Discard every reservation currently linked into @resmap's rbtree.
+ *
+ * Caller must hold resv_lock.
+ */
+static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
+{
+ struct rb_node *node;
+ struct ocfs2_alloc_reservation *resv;
+
+ assert_spin_locked(&resv_lock);
+
+ /*
+ * Each discard erases the node from the tree, so rb_last() is
+ * re-evaluated every iteration until the tree is empty.
+ */
+ while ((node = rb_last(&resmap->m_reservations)) != NULL) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ __ocfs2_resv_discard(resmap, resv);
+ }
+}
+
+/*
+ * Point @resmap at a new disk bitmap of @clen bits. All existing
+ * reservations are discarded first. Does nothing when reservations are
+ * disabled.
+ */
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+ unsigned int clen, char *disk_bitmap)
+{
+ if (ocfs2_resmap_disabled(resmap))
+ return;
+
+ spin_lock(&resv_lock);
+
+ ocfs2_resmap_clear_all_resv(resmap);
+ resmap->m_bitmap_len = clen;
+ resmap->m_disk_bitmap = disk_bitmap;
+
+ spin_unlock(&resv_lock);
+}
+
+/* Counterpart of ocfs2_resmap_init(); currently has nothing to release. */
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
+{
+ /* Does nothing for now. Keep this around for API symmetry */
+}
+
+/*
+ * Link @new into @resmap's rbtree (keyed by window start), mark it INUSE
+ * and move it to the tail of the LRU. The window of @new must already be
+ * set and must not overlap any existing reservation; an overlap is a
+ * BUG().
+ *
+ * Caller must hold resv_lock.
+ */
+static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *new)
+{
+ struct rb_root *root = &resmap->m_reservations;
+ struct rb_node *parent = NULL;
+ struct rb_node **p = &root->rb_node;
+ struct ocfs2_alloc_reservation *tmp;
+
+ assert_spin_locked(&resv_lock);
+
+ trace_ocfs2_resv_insert(new->r_start, new->r_len);
+
+ /* Standard rbtree descent to find the link where 'new' belongs. */
+ while (*p) {
+ parent = *p;
+
+ tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
+
+ if (new->r_start < tmp->r_start) {
+ p = &(*p)->rb_left;
+
+ /*
+ * This is a good place to check for
+ * overlapping reservations.
+ */
+ BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
+ } else if (new->r_start > ocfs2_resv_end(tmp)) {
+ p = &(*p)->rb_right;
+ } else {
+ /* new->r_start lies inside tmp's window. */
+ /* This should never happen! */
+ mlog(ML_ERROR, "Duplicate reservation window!\n");
+ BUG();
+ }
+ }
+
+ rb_link_node(&new->r_node, parent, p);
+ rb_insert_color(&new->r_node, root);
+ new->r_flags |= OCFS2_RESV_FLAG_INUSE;
+
+ ocfs2_resv_mark_lru(resmap, new);
+
+ ocfs2_check_resmap(resmap);
+}
+
+/**
+ * ocfs2_find_resv_lhs() - find the window which contains goal
+ * @resmap: reservation map to search
+ * @goal: which bit to search for
+ *
+ * If a window containing that goal is not found, we return the window
+ * which comes before goal. Returns NULL on empty rbtree or no window
+ * before goal.
+ *
+ * The search is an in-order walk starting at the leftmost node, not a
+ * tree descent. Caller must hold resv_lock.
+ */
+static struct ocfs2_alloc_reservation *
+ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
+{
+ struct ocfs2_alloc_reservation *resv = NULL;
+ struct ocfs2_alloc_reservation *prev_resv = NULL;
+ struct rb_node *node = resmap->m_reservations.rb_node;
+
+ assert_spin_locked(&resv_lock);
+
+ if (!node)
+ return NULL;
+
+ node = rb_first(&resmap->m_reservations);
+ while (node) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ /* Window contains goal: this is the one we want. */
+ if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
+ break;
+
+ /* Check if we overshot the reservation just before goal? */
+ if (resv->r_start > goal) {
+ /* prev_resv is NULL if this was the first window. */
+ resv = prev_resv;
+ break;
+ }
+
+ prev_resv = resv;
+ node = rb_next(node);
+ }
+
+ /*
+ * If the walk ran off the end, resv is the last window in the
+ * tree, which lies entirely before goal.
+ */
+ return resv;
+}
+
+/*
+ * We are given a range within the bitmap, which corresponds to a gap
+ * inside the reservations tree (search_start, search_len). The range
+ * can be anything from the whole bitmap, to a gap between
+ * reservations.
+ *
+ * The incoming values of *rstart and *rlen are insignificant.
+ *
+ * This function searches the bitmap range starting at search_start
+ * with length search_len for a set of contiguous free bits. We try
+ * to find up to 'wanted' bits, but can sometimes return less.
+ *
+ * Returns the length of allocation, 0 if no free bits are found.
+ *
+ * On a non-zero return *rstart and *rlen are populated with the result;
+ * when 0 is returned they are left untouched.
+ */
+static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
+				       unsigned int wanted,
+				       unsigned int search_start,
+				       unsigned int search_len,
+				       unsigned int *rstart,
+				       unsigned int *rlen)
+{
+	void *bitmap = resmap->m_disk_bitmap;
+	unsigned int best_start = 0, best_len = 0;
+	/* Length of the run of zero bits ending just before 'start'. */
+	unsigned int found = 0;
+	unsigned int offset, start;
+
+	trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len,
+						wanted, resmap->m_bitmap_len);
+
+	start = search_start;
+	/*
+	 * The bit search signals "no zero bit left" by returning the
+	 * bitmap length, never -1. Bounding the loop on that value
+	 * guarantees we neither treat bits past the end of the bitmap
+	 * as free nor spin if the search range overshoots the bitmap.
+	 */
+	while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
+						  start)) < resmap->m_bitmap_len) {
+		/* Search reached end of the region */
+		if (offset >= (search_start + search_len))
+			break;
+
+		if (offset == start) {
+			/* we found a zero */
+			found++;
+			/* move start to the next bit to test */
+			start++;
+		} else {
+			/* got a zero after some ones */
+			found = 1;
+			start = offset + 1;
+		}
+		if (found > best_len) {
+			best_len = found;
+			/* 'start' is one past the run, so back up by its length. */
+			best_start = start - found;
+		}
+
+		if (found >= wanted)
+			break;
+	}
+
+	if (best_len == 0)
+		return 0;
+
+	if (best_len >= wanted)
+		best_len = wanted;
+
+	*rlen = best_len;
+	*rstart = best_start;
+
+	trace_ocfs2_resmap_find_free_bits_end(best_start, best_len);
+
+	return *rlen;
+}
+
+/*
+ * Try to find a window of up to @wanted free bits for @resv, searching
+ * the gaps between existing reservations starting near @goal. On success
+ * the window is stored in @resv and @resv is inserted into the tree; on
+ * failure @resv is left untouched (still empty). The largest free run
+ * seen is used if no run of exactly @wanted bits is found.
+ *
+ * Caller must hold resv_lock.
+ */
+static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int goal, unsigned int wanted)
+{
+ struct rb_root *root = &resmap->m_reservations;
+ unsigned int gap_start, gap_end, gap_len;
+ struct ocfs2_alloc_reservation *prev_resv, *next_resv;
+ struct rb_node *prev, *next;
+ unsigned int cstart, clen;
+ unsigned int best_start = 0, best_len = 0;
+
+ /*
+ * Nasty cases to consider:
+ *
+ * - rbtree is empty
+ * - our window should be first in all reservations
+ * - our window should be last in all reservations
+ * - need to make sure we don't go past end of bitmap
+ */
+ trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv),
+ goal, wanted, RB_EMPTY_ROOT(root));
+
+ assert_spin_locked(&resv_lock);
+
+ if (RB_EMPTY_ROOT(root)) {
+ /*
+ * Easiest case - empty tree. We can just take
+ * whatever window of free bits we want.
+ */
+ clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+ resmap->m_bitmap_len - goal,
+ &cstart, &clen);
+
+ /*
+ * This should never happen - the local alloc window
+ * will always have free bits when we're called.
+ */
+ BUG_ON(goal == 0 && clen == 0);
+
+ if (clen == 0)
+ return;
+
+ resv->r_start = cstart;
+ resv->r_len = clen;
+
+ ocfs2_resv_insert(resmap, resv);
+ return;
+ }
+
+ prev_resv = ocfs2_find_resv_lhs(resmap, goal);
+
+ if (prev_resv == NULL) {
+ /*
+ * A NULL here means that the search code couldn't
+ * find a window that starts before goal.
+ *
+ * However, we can take the first window after goal,
+ * which is also by definition, the leftmost window in
+ * the entire tree. If we can find free bits in the
+ * gap between goal and the LHS window, then the
+ * reservation can safely be placed there.
+ *
+ * Otherwise we fall back to a linear search, checking
+ * the gaps in between windows for a place to
+ * allocate.
+ */
+
+ next = rb_first(root);
+ next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
+ r_node);
+
+ /*
+ * The search should never return such a window. (see
+ * comment above)
+ */
+ if (next_resv->r_start <= goal) {
+ mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
+ goal, next_resv->r_start, next_resv->r_len);
+ ocfs2_dump_resv(resmap);
+ BUG();
+ }
+
+ clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+ next_resv->r_start - goal,
+ &cstart, &clen);
+ if (clen) {
+ best_len = clen;
+ best_start = cstart;
+ if (best_len == wanted)
+ goto out_insert;
+ }
+
+ /* Continue the linear search from the leftmost window. */
+ prev_resv = next_resv;
+ next_resv = NULL;
+ }
+
+ trace_ocfs2_resv_find_window_prev(prev_resv->r_start,
+ ocfs2_resv_end(prev_resv));
+
+ prev = &prev_resv->r_node;
+
+ /* Now we do a linear search for a window, starting at 'prev_resv' */
+ while (1) {
+ next = rb_next(prev);
+ if (next) {
+ next_resv = rb_entry(next,
+ struct ocfs2_alloc_reservation,
+ r_node);
+
+ /* Gap is the bits strictly between the two windows. */
+ gap_start = ocfs2_resv_end(prev_resv) + 1;
+ gap_end = next_resv->r_start - 1;
+ gap_len = gap_end - gap_start + 1;
+ } else {
+ /*
+ * We're at the rightmost edge of the
+ * tree. See if a reservation between this
+ * window and the end of the bitmap will work.
+ */
+ gap_start = ocfs2_resv_end(prev_resv) + 1;
+ gap_len = resmap->m_bitmap_len - gap_start;
+ gap_end = resmap->m_bitmap_len - 1;
+ }
+
+ trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
+ next ? ocfs2_resv_end(next_resv) : -1);
+ /*
+ * No need to check this gap if we have already found
+ * a larger region of free bits.
+ */
+ if (gap_len <= best_len)
+ goto next_resv;
+
+ clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
+ gap_len, &cstart, &clen);
+ if (clen == wanted) {
+ best_len = clen;
+ best_start = cstart;
+ goto out_insert;
+ } else if (clen > best_len) {
+ best_len = clen;
+ best_start = cstart;
+ }
+
+ /* Label only; shares its name with the next_resv variable. */
+next_resv:
+ if (!next)
+ break;
+
+ prev = next;
+ prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
+ r_node);
+ }
+
+out_insert:
+ /* best_len == 0 means nothing was found; leave resv empty. */
+ if (best_len) {
+ resv->r_start = best_start;
+ resv->r_len = best_len;
+ ocfs2_resv_insert(resmap, resv);
+ }
+}
+
+/*
+ * Last-resort way of giving @resv a window: steal bits from the
+ * reservation at the head of the LRU list. A small victim is taken over
+ * entirely and discarded; a larger one is shrunk and @resv receives the
+ * bits cut from its end. @resv is inserted into the tree on return.
+ *
+ * NOTE(review): assumes m_lru is non-empty; list_first_entry() on an
+ * empty list does not yield a valid reservation. Confirm that callers
+ * guarantee this.
+ */
+static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int wanted)
+{
+ struct ocfs2_alloc_reservation *lru_resv;
+ int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
+ unsigned int min_bits;
+
+ if (!tmpwindow)
+ min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
+ else
+ min_bits = wanted; /* We at least know the temp window will
+ * use all of these bits */
+
+ /*
+ * Take the first reservation off the LRU as our 'target'. We
+ * don't try to be smart about it. There might be a case for
+ * searching based on size but I don't have enough data to be
+ * sure. --Mark (3/16/2010)
+ */
+ lru_resv = list_first_entry(&resmap->m_lru,
+ struct ocfs2_alloc_reservation, r_lru);
+
+ trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start,
+ lru_resv->r_len,
+ ocfs2_resv_end(lru_resv));
+
+ /*
+ * Cannibalize (some or all) of the target reservation and
+ * feed it to the current window.
+ */
+ if (lru_resv->r_len <= min_bits) {
+ /*
+ * Discard completely if size is less than or equal to a
+ * reasonable threshold - 50% of window bits for non temporary
+ * windows.
+ */
+ /* Copy the range before the discard zeroes it. */
+ resv->r_start = lru_resv->r_start;
+ resv->r_len = lru_resv->r_len;
+
+ __ocfs2_resv_discard(resmap, lru_resv);
+ } else {
+ unsigned int shrink;
+ if (tmpwindow)
+ shrink = min_bits;
+ else
+ shrink = lru_resv->r_len / 2;
+
+ lru_resv->r_len -= shrink;
+
+ /* The stolen bits start right after the shrunken victim. */
+ resv->r_start = ocfs2_resv_end(lru_resv) + 1;
+ resv->r_len = shrink;
+ }
+
+ trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv),
+ resv->r_len, resv->r_last_start,
+ resv->r_last_len);
+
+ ocfs2_resv_insert(resmap, resv);
+}
+
+/*
+ * Give the (currently empty) reservation @resv a window of up to @wanted
+ * bits. Tries, in order: a search starting right after the previous
+ * allocation, a search from the start of the bitmap, and finally
+ * cannibalizing the least recently used reservation. @resv is guaranteed
+ * to be non-empty on return (enforced with BUG_ON).
+ */
+static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int wanted)
+{
+ unsigned int goal = 0;
+
+ BUG_ON(!ocfs2_resv_empty(resv));
+
+ /*
+ * Begin by trying to get a window as close to the previous
+ * one as possible. Using the most recent allocation as a
+ * start goal makes sense.
+ */
+ if (resv->r_last_len) {
+ goal = resv->r_last_start + resv->r_last_len;
+ /* Previous allocation ended at the bitmap's end: wrap. */
+ if (goal >= resmap->m_bitmap_len)
+ goal = 0;
+ }
+
+ __ocfs2_resv_find_window(resmap, resv, goal, wanted);
+
+ /* Search from last alloc didn't work, try once more from beginning. */
+ if (ocfs2_resv_empty(resv) && goal != 0)
+ __ocfs2_resv_find_window(resmap, resv, 0, wanted);
+
+ if (ocfs2_resv_empty(resv)) {
+ /*
+ * Still empty? Pull oldest one off the LRU, remove it from
+ * tree, put this one in its place.
+ */
+ ocfs2_cannibalize_resv(resmap, resv, wanted);
+ }
+
+ BUG_ON(ocfs2_resv_empty(resv));
+}
+
+/*
+ * Report the current window of @resv in *@cstart / *@clen, allocating a
+ * window first if @resv is empty. On entry *@clen is the number of bits
+ * the caller wants; it is used as a lower bound (and, for temporary
+ * reservations, as the exact size) of the window requested.
+ *
+ * Returns 0 on success, -ENOSPC if @resv is NULL or reservations are
+ * disabled.
+ */
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ int *cstart, int *clen)
+{
+ if (resv == NULL || ocfs2_resmap_disabled(resmap))
+ return -ENOSPC;
+
+ spin_lock(&resv_lock);
+
+ if (ocfs2_resv_empty(resv)) {
+ /*
+ * We don't want to over-allocate for temporary
+ * windows. Otherwise, we run the risk of fragmenting the
+ * allocation space.
+ */
+ unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
+
+ if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
+ wanted = *clen;
+
+ /*
+ * Try to get a window here. If it works, we must fall
+ * through and test the bitmap . This avoids some
+ * ping-ponging of windows due to non-reserved space
+ * being allocation before we initialize a window for
+ * that inode.
+ */
+ ocfs2_resv_find_window(resmap, resv, wanted);
+ trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len);
+ }
+
+ BUG_ON(ocfs2_resv_empty(resv));
+
+ *cstart = resv->r_start;
+ *clen = resv->r_len;
+
+ spin_unlock(&resv_lock);
+ return 0;
+}
+
+/*
+ * Shrink @resv after the bits [@start, @end] (inclusive) at the front of
+ * its window were allocated. @start must equal the window start and @end
+ * must not lie beyond the window end (BUG_ON otherwise). If the whole
+ * window was consumed the reservation is discarded; otherwise the window
+ * becomes the remaining tail starting at @end + 1.
+ */
+static void
+ ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int start, unsigned int end)
+{
+ unsigned int rhs = 0;
+ unsigned int old_end = ocfs2_resv_end(resv);
+
+ BUG_ON(start != resv->r_start || old_end < end);
+
+ /*
+ * Completely used? We can remove it then.
+ */
+ if (old_end == end) {
+ __ocfs2_resv_discard(resmap, resv);
+ return;
+ }
+
+ /* Number of bits left over to the right of the allocation. */
+ rhs = old_end - end;
+
+ /*
+ * This should have been trapped above.
+ */
+ BUG_ON(rhs == 0);
+
+ resv->r_start = end + 1;
+ resv->r_len = old_end - resv->r_start + 1;
+}
+
+/*
+ * Tell the reservation code that @clen bits starting at @cstart were
+ * allocated out of @resv's window. The window is shrunk (or discarded if
+ * fully consumed), the allocation is remembered in r_last_start /
+ * r_last_len, and a still non-empty reservation is moved to the LRU tail.
+ *
+ * Does nothing if @resmap or @resv is NULL or reservations are disabled.
+ * @cstart must equal the current window start.
+ */
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ u32 cstart, u32 clen)
+{
+ /* Last allocated bit, inclusive. */
+ unsigned int cend = cstart + clen - 1;
+
+ if (resmap == NULL || ocfs2_resmap_disabled(resmap))
+ return;
+
+ if (resv == NULL)
+ return;
+
+ /*
+ * NOTE(review): this reads resv->r_start before resv_lock is
+ * taken; presumably callers serialize access to resv by other
+ * means - confirm.
+ */
+ BUG_ON(cstart != resv->r_start);
+
+ spin_lock(&resv_lock);
+
+ trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start,
+ ocfs2_resv_end(resv), resv->r_len,
+ resv->r_last_start,
+ resv->r_last_len);
+
+ BUG_ON(cstart < resv->r_start);
+ BUG_ON(cstart > ocfs2_resv_end(resv));
+ BUG_ON(cend > ocfs2_resv_end(resv));
+
+ ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
+ /* Set after the adjust: a discard in there zeroes these fields. */
+ resv->r_last_start = cstart;
+ resv->r_last_len = clen;
+
+ /*
+ * May have been discarded above from
+ * ocfs2_adjust_resv_from_alloc().
+ */
+ if (!ocfs2_resv_empty(resv))
+ ocfs2_resv_mark_lru(resmap, resv);
+
+ trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv),
+ resv->r_len, resv->r_last_start,
+ resv->r_last_len);
+
+ ocfs2_check_resmap(resmap);
+
+ spin_unlock(&resv_lock);
+}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644
index 00000000000..42c2b804f3f
--- /dev/null
+++ b/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.h
+ *
+ * Allocation reservations function prototypes and structures.
+ *
+ * Copyright (C) 2010 Novell. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_RESERVATIONS_H
+#define OCFS2_RESERVATIONS_H
+
+#include <linux/rbtree.h>
+
+#define OCFS2_DEFAULT_RESV_LEVEL 2
+#define OCFS2_MAX_RESV_LEVEL 9
+#define OCFS2_MIN_RESV_LEVEL 0
+
+/*
+ * One allocation reservation: a window [r_start, r_start + r_len - 1] of
+ * bits in the reservation map's bitmap. r_len == 0 means the reservation
+ * is empty.
+ */
+struct ocfs2_alloc_reservation {
+ struct rb_node r_node; /* Linkage in the map's rbtree */
+
+ unsigned int r_start; /* Beginning of current window */
+ unsigned int r_len; /* Length of the window */
+
+ unsigned int r_last_len; /* Length of most recent alloc */
+ unsigned int r_last_start; /* Start of most recent alloc */
+ struct list_head r_lru; /* LRU list head */
+
+ unsigned int r_flags; /* OCFS2_RESV_FLAG_* bits */
+};
+
+#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of an rbtree */
+#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
+ * destroyed immediately after use */
+#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
+ * directory btree */
+
+/*
+ * The set of reservations carved out of one disk bitmap, kept both in an
+ * rbtree ordered by window start and on an LRU list.
+ */
+struct ocfs2_reservation_map {
+ struct rb_root m_reservations; /* Reservations, by window start */
+ char *m_disk_bitmap; /* Bitmap searched for free bits */
+
+ struct ocfs2_super *m_osb; /* Owning super block */
+
+ /* The following are not initialized to meaningful values until a disk
+ * bitmap is provided. */
+ u32 m_bitmap_len; /* Number of valid
+ * bits available */
+
+ struct list_head m_lru; /* LRU of reservations
+ * structures. */
+
+};
+
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
+
+#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+ unsigned int flags);
+
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
+
+/**
+ * ocfs2_resv_discard() - truncate a reservation
+ * @resmap: the reservation map @resv belongs to
+ * @resv: the reservation to truncate. May be NULL, in which case nothing
+ *        is done.
+ *
+ * After this function is called, the reservation will be empty, and
+ * unlinked from the rbtree.
+ */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv);
+
+
+/**
+ * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
+ * @osb: the ocfs2 super block the reservation map belongs to
+ * @resmap: struct ocfs2_reservation_map to initialize
+ *
+ * The disk bitmap and its length are not set here; they are provided
+ * later through ocfs2_resmap_restart().
+ *
+ * The current implementation performs no allocation and always returns 0.
+ */
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+ struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_restart() - "restart" a reservation bitmap
+ * @resmap: reservations bitmap
+ * @clen: Number of valid bits in the bitmap
+ * @disk_bitmap: the disk bitmap this resmap should refer to.
+ *
+ * Re-initialize the parameters of a reservation bitmap. This is
+ * useful for local alloc window slides.
+ *
+ * This function will discard all existing reservations. A future
+ * version will recalculate existing reservations based on the new
+ * bitmap.
+ */
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+ unsigned int clen, char *disk_bitmap);
+
+/**
+ * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
+ * @resmap: the struct ocfs2_reservation_map to uninitialize
+ */
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
+ * @resmap: reservations bitmap
+ * @resv: reservation to base search from
+ * @cstart: start of proposed allocation
+ * @clen: length (in clusters) of proposed allocation
+ *
+ * Using the reservation data from resv, this function will compare
+ * resmap and resmap->m_disk_bitmap to determine what part (if any) of
+ * the reservation window is still clear to use. If resv is empty,
+ * this function will try to allocate a window for it.
+ *
+ * On success, zero is returned and the valid allocation area is set in cstart
+ * and clen.
+ *
+ * Returns -ENOSPC if reservations are disabled.
+ */
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ int *cstart, int *clen);
+
+/**
+ * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
+ * @resmap: reservations bitmap
+ * @resv: optional reservation to recalculate based on new bitmap
+ * @cstart: start of allocation in clusters
+ * @clen: length of allocation in clusters.
+ *
+ * Tell the reservation code that bits were used to fulfill allocation in
+ * resmap. The bits don't have to have been part of any existing
+ * reservation. But we must always call this function when bits are claimed.
+ * Internally, the reservations code will use this information to mark the
+ * reservations bitmap. If resv is passed, its next allocation window will be
+ * calculated. It also expects that 'cstart' is the same as we passed back
+ * from ocfs2_resmap_resv_bits().
+ */
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ u32 cstart, u32 clen);
+
+#endif /* OCFS2_RESERVATIONS_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 424adaa5f90..d5da6f62414 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -27,7 +27,6 @@
#include <linux/fs.h>
#include <linux/types.h>
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -39,6 +38,7 @@
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
#include "suballoc.h"
@@ -53,8 +53,6 @@
*/
static u16 ocfs2_calc_new_backup_super(struct inode *inode,
struct ocfs2_group_desc *gd,
- int new_clusters,
- u32 first_new_cluster,
u16 cl_cpg,
int set)
{
@@ -82,7 +80,6 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode,
backups++;
}
- mlog_exit_void();
return backups;
}
@@ -103,11 +100,11 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
- mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
- new_clusters, first_new_cluster);
+ trace_ocfs2_update_last_group_and_inode(new_clusters,
+ first_new_cluster);
- ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
+ group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -128,20 +125,14 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
backups = ocfs2_calc_new_backup_super(bm_inode,
group,
- new_clusters,
- first_new_cluster,
cl_cpg, 1);
le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
}
- ret = ocfs2_journal_dirty(handle, group_bh);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_rollback;
- }
+ ocfs2_journal_dirty(handle, group_bh);
/* update the inode accordingly. */
- ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
@@ -162,7 +153,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
spin_lock(&OCFS2_I(bm_inode)->ip_lock);
OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits);
+ le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits);
spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
i_size_write(bm_inode, le64_to_cpu(fe->i_size));
@@ -172,15 +163,14 @@ out_rollback:
if (ret < 0) {
ocfs2_calc_new_backup_super(bm_inode,
group,
- new_clusters,
- first_new_cluster,
cl_cpg, 0);
le16_add_cpu(&group->bg_free_bits_count, backups);
le16_add_cpu(&group->bg_bits, -1 * num_bits);
le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
}
out:
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
}
@@ -285,8 +275,6 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
u32 first_new_cluster;
u64 lgd_blkno;
- mlog_entry_void();
-
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
@@ -319,7 +307,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
- ocfs2_group_bitmap_size(osb->sb) * 8) {
+ ocfs2_group_bitmap_size(osb->sb, 0,
+ osb->s_feature_incompat) * 8) {
mlog(ML_ERROR, "The disk is too old and small. "
"Force to do offline resize.");
ret = -EINVAL;
@@ -345,7 +334,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
goto out_unlock;
}
- mlog(0, "extend the last group at %llu, new clusters = %d\n",
+
+ trace_ocfs2_group_extend(
(unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
@@ -380,7 +370,6 @@ out_mutex:
iput(main_bm_inode);
out:
- mlog_exit_void();
return ret;
}
@@ -474,8 +463,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
struct ocfs2_chain_list *cl;
struct ocfs2_chain_rec *cr;
u16 cl_bpc;
-
- mlog_entry_void();
+ u64 bg_ptr;
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
@@ -500,7 +488,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
- ocfs2_group_bitmap_size(osb->sb) * 8) {
+ ocfs2_group_bitmap_size(osb->sb, 0,
+ osb->s_feature_incompat) * 8) {
mlog(ML_ERROR, "The disk is too old and small."
" Force to do offline resize.");
ret = -EINVAL;
@@ -514,47 +503,44 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
goto out_unlock;
}
- ocfs2_set_new_buffer_uptodate(inode, group_bh);
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh);
ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
if (ret) {
mlog_errno(ret);
- goto out_unlock;
+ goto out_free_group_bh;
}
- mlog(0, "Add a new group %llu in chain = %u, length = %u\n",
- (unsigned long long)input->group, input->chain, input->clusters);
+ trace_ocfs2_group_add((unsigned long long)input->group,
+ input->chain, input->clusters, input->frees);
handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
if (IS_ERR(handle)) {
mlog_errno(PTR_ERR(handle));
ret = -EINVAL;
- goto out_unlock;
+ goto out_free_group_bh;
}
cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
cl = &fe->id2.i_chain;
cr = &cl->cl_recs[input->chain];
- ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode),
+ group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
}
group = (struct ocfs2_group_desc *)group_bh->b_data;
+ bg_ptr = le64_to_cpu(group->bg_next_group);
group->bg_next_group = cr->c_blkno;
+ ocfs2_journal_dirty(handle, group_bh);
- ret = ocfs2_journal_dirty(handle, group_bh);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
+ main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
+ group->bg_next_group = cpu_to_le64(bg_ptr);
mlog_errno(ret);
goto out_commit;
}
@@ -577,7 +563,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits);
+ le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits);
spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
@@ -585,8 +571,11 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
out_commit:
ocfs2_commit_trans(osb, handle);
-out_unlock:
+
+out_free_group_bh:
brelse(group_bh);
+
+out_unlock:
brelse(main_bm_bh);
ocfs2_inode_unlock(main_bm_inode, 1);
@@ -596,6 +585,5 @@ out_mutex:
iput(main_bm_inode);
out:
- mlog_exit_void();
return ret;
}
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 40661e7824e..1424c151ccc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -27,7 +27,6 @@
#include <linux/slab.h>
#include <linux/highmem.h>
-#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -39,6 +38,7 @@
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -142,16 +142,15 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
BUG_ON(si->si_blocks == 0);
BUG_ON(si->si_bh == NULL);
- mlog(0, "Refreshing slot map, reading %u block(s)\n",
- si->si_blocks);
+ trace_ocfs2_refresh_slot_info(si->si_blocks);
/*
* We pass -1 as blocknr because we expect all of si->si_bh to
* be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
* this is not true, the read of -1 (UINT64_MAX) will fail.
*/
- ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
- OCFS2_BH_IGNORE_CACHE, NULL);
+ ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks,
+ si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL);
if (ret == 0) {
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
@@ -213,7 +212,7 @@ static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
ocfs2_update_disk_slot_old(si, slot_num, &bh);
spin_unlock(&osb->osb_lock);
- status = ocfs2_write_block(osb, bh, si->si_inode);
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode));
if (status < 0)
mlog_errno(status);
@@ -357,7 +356,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
{
int status = 0;
u64 blkno;
- unsigned long long blocks, bytes;
+ unsigned long long blocks, bytes = 0;
unsigned int i;
struct buffer_head *bh;
@@ -381,8 +380,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
/* The size checks above should ensure this */
BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
- mlog(0, "Slot map needs %u buffers for %llu bytes\n",
- si->si_blocks, bytes);
+ trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
GFP_KERNEL);
@@ -400,12 +398,11 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
goto bail;
}
- mlog(0, "Reading slot map block %u at %llu\n", i,
- (unsigned long long)blkno);
+ trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i);
bh = NULL; /* Acquire a fresh bh */
- status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
- OCFS2_BH_IGNORE_CACHE, NULL);
+ status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
+ 1, &bh, OCFS2_BH_IGNORE_CACHE, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -475,8 +472,6 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
int slot;
struct ocfs2_slot_info *si;
- mlog_entry_void();
-
si = osb->slot_info;
spin_lock(&osb->osb_lock);
@@ -498,21 +493,20 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
goto bail;
}
} else
- mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
- slot);
+ printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
+ "allocated to this node!\n", slot, osb->dev_str);
ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
spin_unlock(&osb->osb_lock);
- mlog(0, "taking node slot %d\n", osb->slot_num);
+ trace_ocfs2_find_slot(osb->slot_num);
status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
if (status < 0)
mlog_errno(status);
bail:
- mlog_exit(status);
return status;
}
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index fcd120f1493..1724d43d3da 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -17,7 +17,9 @@
* General Public License for more details.
*/
+#include <linux/kernel.h>
#include <linux/crc32.h>
+#include <linux/slab.h>
#include <linux/module.h>
/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
@@ -26,6 +28,7 @@
#include "cluster/masklog.h"
#include "cluster/nodemanager.h"
#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
#include "stackglue.h"
@@ -153,31 +156,30 @@ static int status_map[] = {
static int dlm_status_to_errno(enum dlm_status status)
{
- BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0])));
+ BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map));
return status_map[status];
}
static void o2dlm_lock_ast_wrapper(void *astarg)
{
- BUG_ON(o2cb_stack.sp_proto == NULL);
+ struct ocfs2_dlm_lksb *lksb = astarg;
- o2cb_stack.sp_proto->lp_lock_ast(astarg);
+ lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
}
static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
{
- BUG_ON(o2cb_stack.sp_proto == NULL);
+ struct ocfs2_dlm_lksb *lksb = astarg;
- o2cb_stack.sp_proto->lp_blocking_ast(astarg, level);
+ lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
}
static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
{
+ struct ocfs2_dlm_lksb *lksb = astarg;
int error = dlm_status_to_errno(status);
- BUG_ON(o2cb_stack.sp_proto == NULL);
-
/*
* In o2dlm, you can get both the lock_ast() for the lock being
* granted and the unlock_ast() for the CANCEL failing. A
@@ -192,16 +194,15 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
if (status == DLM_CANCELGRANT)
return;
- o2cb_stack.sp_proto->lp_unlock_ast(astarg, error);
+ lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error);
}
static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
int mode,
- union ocfs2_dlm_lksb *lksb,
+ struct ocfs2_dlm_lksb *lksb,
u32 flags,
void *name,
- unsigned int namelen,
- void *astarg)
+ unsigned int namelen)
{
enum dlm_status status;
int o2dlm_mode = mode_to_o2dlm(mode);
@@ -210,43 +211,107 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
o2dlm_flags, name, namelen,
- o2dlm_lock_ast_wrapper, astarg,
+ o2dlm_lock_ast_wrapper, lksb,
o2dlm_blocking_ast_wrapper);
ret = dlm_status_to_errno(status);
return ret;
}
static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
- union ocfs2_dlm_lksb *lksb,
- u32 flags,
- void *astarg)
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags)
{
enum dlm_status status;
int o2dlm_flags = flags_to_o2dlm(flags);
int ret;
status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
- o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg);
+ o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb);
ret = dlm_status_to_errno(status);
return ret;
}
-static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
{
return dlm_status_to_errno(lksb->lksb_o2dlm.status);
}
-static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+/*
+ * o2dlm always has a "valid" LVB. If the dlm loses track of the LVB
+ * contents, it will zero out the LVB. Thus the caller can always trust
+ * the contents.
+ */
+static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
+{
+ return 1;
+}
+
+static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
{
return (void *)(lksb->lksb_o2dlm.lvb);
}
-static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb)
+static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
{
dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
}
/*
+ * Check if this node is heartbeating and is connected to all other
+ * heartbeating nodes.
+ */
+static int o2cb_cluster_check(void)
+{
+ u8 node_num;
+ int i;
+ unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+ node_num = o2nm_this_node();
+ if (node_num == O2NM_MAX_NODES) {
+ printk(KERN_ERR "o2cb: This node has not been configured.\n");
+ return -EINVAL;
+ }
+
+ /*
+ * o2dlm expects o2net sockets to be created. If not, then
+ * dlm_join_domain() fails with a stack of errors which are both cryptic
+ * and incomplete. The idea here is to detect upfront whether we have
+ * managed to connect to all nodes or not. If not, then list the nodes
+ * to allow the user to check the configuration (incorrect IP, firewall,
+ * etc.) Yes, this is racy. But it's not the end of the world.
+ */
+#define O2CB_MAP_STABILIZE_COUNT 60
+ for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
+ o2hb_fill_node_map(hbmap, sizeof(hbmap));
+ if (!test_bit(node_num, hbmap)) {
+ printk(KERN_ERR "o2cb: %s heartbeat has not been "
+ "started.\n", (o2hb_global_heartbeat_active() ?
+ "Global" : "Local"));
+ return -EINVAL;
+ }
+ o2net_fill_node_map(netmap, sizeof(netmap));
+ /* Force set the current node to allow easy compare */
+ set_bit(node_num, netmap);
+ if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+ return 0;
+ if (i < O2CB_MAP_STABILIZE_COUNT)
+ msleep(1000);
+ }
+
+ printk(KERN_ERR "o2cb: This node could not connect to nodes:");
+ i = -1;
+ while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
+ i + 1)) < O2NM_MAX_NODES) {
+ if (!test_bit(i, netmap))
+ printk(" %u", i);
+ }
+ printk(".\n");
+
+ return -ENOTCONN;
+}
+
+/*
* Called from the dlm when it's about to evict a node. This is how the
* classic stack signals node death.
*/
@@ -254,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data)
{
struct ocfs2_cluster_connection *conn = data;
- mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
- node_num, conn->cc_namelen, conn->cc_name);
+ printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n",
+ node_num, conn->cc_namelen, conn->cc_name);
conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
}
@@ -266,15 +331,16 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
u32 dlm_key;
struct dlm_ctxt *dlm;
struct o2dlm_private *priv;
- struct dlm_protocol_version dlm_version;
+ struct dlm_protocol_version fs_version;
BUG_ON(conn == NULL);
- BUG_ON(o2cb_stack.sp_proto == NULL);
+ BUG_ON(conn->cc_proto == NULL);
- /* for now we only have one cluster/node, make sure we see it
- * in the heartbeat universe */
- if (!o2hb_check_local_node_heartbeating()) {
- rc = -EINVAL;
+ /* Ensure cluster stack is up and all nodes are connected */
+ rc = o2cb_cluster_check();
+ if (rc) {
+ printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
+ "before retrying.\n");
goto out;
}
@@ -293,24 +359,24 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
/* used by the dlm code to make message headers unique, each
* node in this domain must agree on this. */
dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
- dlm_version.pv_major = conn->cc_version.pv_major;
- dlm_version.pv_minor = conn->cc_version.pv_minor;
+ fs_version.pv_major = conn->cc_version.pv_major;
+ fs_version.pv_minor = conn->cc_version.pv_minor;
- dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version);
+ dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version);
if (IS_ERR(dlm)) {
rc = PTR_ERR(dlm);
mlog_errno(rc);
goto out_free;
}
- conn->cc_version.pv_major = dlm_version.pv_major;
- conn->cc_version.pv_minor = dlm_version.pv_minor;
+ conn->cc_version.pv_major = fs_version.pv_major;
+ conn->cc_version.pv_minor = fs_version.pv_minor;
conn->cc_lockspace = dlm;
dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
out_free:
- if (rc && conn->cc_private)
+ if (rc)
kfree(conn->cc_private);
out:
@@ -332,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
return 0;
}
-static int o2cb_cluster_this_node(unsigned int *node)
+static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node)
{
int node_num;
@@ -354,6 +421,7 @@ static struct ocfs2_stack_operations o2cb_stack_ops = {
.dlm_lock = o2cb_dlm_lock,
.dlm_unlock = o2cb_dlm_unlock,
.lock_status = o2cb_dlm_lock_status,
+ .lvb_valid = o2cb_dlm_lvb_valid,
.lock_lvb = o2cb_dlm_lvb,
.dump_lksb = o2cb_dump_lksb,
};
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 9b76d41a8ac..13a8537d8e8 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,11 +21,11 @@
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
-#include <linux/smp_lock.h>
+#include <linux/slab.h>
#include <linux/reboot.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
-#include "ocfs2.h" /* For struct ocfs2_lock_res */
#include "stackglue.h"
#include <linux/dlm_plock.h>
@@ -63,8 +63,8 @@
* negotiated by the client. The client negotiates based on the maximum
* version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
* number from the "SETV" message must match
- * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number
- * must be less than or equal to ...->lp_max_version.pv_minor.
+ * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number
+ * must be less than or equal to ...sp_max_proto.pv_minor.
*
* Once this information has been set, mounts will be allowed. From this
* point on, the "DOWN" message can be sent for node down notification.
@@ -103,6 +103,12 @@
#define OCFS2_TEXT_UUID_LEN 32
#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2
#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8
+#define VERSION_LOCK "version_lock"
+
+enum ocfs2_connection_type {
+ WITH_CONTROLD,
+ NO_CONTROLD
+};
/*
* ocfs2_live_connection is refcounted because the filesystem and
@@ -111,6 +117,13 @@
struct ocfs2_live_connection {
struct list_head oc_list;
struct ocfs2_cluster_connection *oc_conn;
+ enum ocfs2_connection_type oc_type;
+ atomic_t oc_this_node;
+ int oc_our_slot;
+ struct dlm_lksb oc_version_lksb;
+ char oc_lvb[DLM_LVB_LEN];
+ struct completion oc_sync_wait;
+ wait_queue_head_t oc_wait;
};
struct ocfs2_control_private {
@@ -191,7 +204,7 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
return c;
}
- return c;
+ return NULL;
}
/*
@@ -199,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
* mount path. Since the VFS prevents multiple calls to
* fill_super(), we can't get dupes here.
*/
-static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
- struct ocfs2_live_connection **c_ret)
+static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
+ struct ocfs2_live_connection *c)
{
int rc = 0;
- struct ocfs2_live_connection *c;
-
- c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
- if (!c)
- return -ENOMEM;
mutex_lock(&ocfs2_control_lock);
c->oc_conn = conn;
- if (atomic_read(&ocfs2_control_opened))
+ if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))
list_add(&c->oc_list, &ocfs2_live_connection_list);
else {
printk(KERN_ERR
@@ -221,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
}
mutex_unlock(&ocfs2_control_lock);
-
- if (!rc)
- *c_ret = c;
- else
- kfree(c);
-
return rc;
}
@@ -401,7 +403,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
char *ptr = NULL;
struct ocfs2_control_private *p = file->private_data;
struct ocfs2_protocol_version *max =
- &ocfs2_user_plugin.sp_proto->lp_max_version;
+ &ocfs2_user_plugin.sp_max_proto;
if (ocfs2_control_get_handshake_state(file) !=
OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -612,12 +614,10 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
return -ENOMEM;
p->op_this_node = -1;
- lock_kernel();
mutex_lock(&ocfs2_control_lock);
file->private_data = p;
list_add(&p->op_list, &ocfs2_control_private_list);
mutex_unlock(&ocfs2_control_lock);
- unlock_kernel();
return 0;
}
@@ -628,6 +628,7 @@ static const struct file_operations ocfs2_control_fops = {
.read = ocfs2_control_read,
.write = ocfs2_control_write,
.owner = THIS_MODULE,
+ .llseek = default_llseek,
};
static struct miscdevice ocfs2_control_device = {
@@ -664,18 +665,10 @@ static void ocfs2_control_exit(void)
-rc);
}
-static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
-{
- struct ocfs2_lock_res *res = astarg;
- return &res->l_lksb.lksb_fsdlm;
-}
-
static void fsdlm_lock_ast_wrapper(void *astarg)
{
- struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg);
- int status = lksb->sb_status;
-
- BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
+ struct ocfs2_dlm_lksb *lksb = astarg;
+ int status = lksb->lksb_fsdlm.sb_status;
/*
* For now we're punting on the issue of other non-standard errors
@@ -688,25 +681,24 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
*/
if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
- ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0);
+ lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0);
else
- ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg);
+ lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
}
static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
{
- BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
+ struct ocfs2_dlm_lksb *lksb = astarg;
- ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level);
+ lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
}
static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
int mode,
- union ocfs2_dlm_lksb *lksb,
+ struct ocfs2_dlm_lksb *lksb,
u32 flags,
void *name,
- unsigned int namelen,
- void *astarg)
+ unsigned int namelen)
{
int ret;
@@ -716,29 +708,35 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
flags|DLM_LKF_NODLCKWT, name, namelen, 0,
- fsdlm_lock_ast_wrapper, astarg,
+ fsdlm_lock_ast_wrapper, lksb,
fsdlm_blocking_ast_wrapper);
return ret;
}
static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
- union ocfs2_dlm_lksb *lksb,
- u32 flags,
- void *astarg)
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags)
{
int ret;
ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
- flags, &lksb->lksb_fsdlm, astarg);
+ flags, &lksb->lksb_fsdlm, lksb);
return ret;
}
-static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
{
return lksb->lksb_fsdlm.sb_status;
}
-static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
+{
+ int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
+
+ return !invalid;
+}
+
+static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
{
if (!lksb->lksb_fsdlm.sb_lvbptr)
lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
@@ -746,7 +744,7 @@ static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
}
-static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
{
}
@@ -804,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
return 0;
}
+static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ ver->pv_major = pv->pv_major;
+ ver->pv_minor = pv->pv_minor;
+}
+
+static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ pv->pv_major = ver->pv_major;
+ pv->pv_minor = ver->pv_minor;
+}
+
+static void sync_wait_cb(void *arg)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ complete(&lc->oc_sync_wait);
+}
+
+static int sync_unlock(struct ocfs2_cluster_connection *conn,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x error %d\n",
+ name, lksb->sb_lkid, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ if (lksb->sb_status != -DLM_EUNLOCK) {
+ printk(KERN_ERR "%s lkid %x status %d\n",
+ name, lksb->sb_lkid, lksb->sb_status);
+ return -1;
+ }
+ return 0;
+}
+
+static int sync_lock(struct ocfs2_cluster_connection *conn,
+ int mode, uint32_t flags,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error, status;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_lock(conn->cc_lockspace, mode, lksb, flags,
+ name, strlen(name),
+ 0, sync_wait_cb, conn, NULL);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n",
+ name, lksb->sb_lkid, flags, mode, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ status = lksb->sb_status;
+
+ if (status && status != -EAGAIN) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n",
+ name, lksb->sb_lkid, flags, mode, status);
+ }
+
+ return status;
+}
+
+
+static int version_lock(struct ocfs2_cluster_connection *conn, int mode,
+ int flags)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_lock(conn, mode, flags,
+ &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+static int version_unlock(struct ocfs2_cluster_connection *conn)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+/* get_protocol_version()
+ *
+ * To exchange ocfs2 versioning, we use the LVB of the version dlm lock.
+ * The algorithm is:
+ * 1. Attempt to take the lock in EX mode (non-blocking).
+ * 2. If successful (which means it is the first mount), write the
+ * version number and downconvert to PR lock.
+ * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after
+ * taking the PR lock.
+ */
+
+static int get_protocol_version(struct ocfs2_cluster_connection *conn)
+{
+ int ret;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ struct ocfs2_protocol_version pv;
+
+ running_proto.pv_major =
+ ocfs2_user_plugin.sp_max_proto.pv_major;
+ running_proto.pv_minor =
+ ocfs2_user_plugin.sp_max_proto.pv_minor;
+
+ lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb;
+ ret = version_lock(conn, DLM_LOCK_EX,
+ DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
+ if (!ret) {
+ conn->cc_version.pv_major = running_proto.pv_major;
+ conn->cc_version.pv_minor = running_proto.pv_minor;
+ version_to_lvb(&running_proto, lc->oc_lvb);
+ version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+ } else if (ret == -EAGAIN) {
+ ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
+ if (ret)
+ goto out;
+ lvb_to_version(lc->oc_lvb, &pv);
+
+ if ((pv.pv_major != running_proto.pv_major) ||
+ (pv.pv_minor > running_proto.pv_minor)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ conn->cc_version.pv_major = pv.pv_major;
+ conn->cc_version.pv_minor = pv.pv_minor;
+ }
+out:
+ return ret;
+}
+
+static void user_recover_prep(void *arg)
+{
+}
+
+static void user_recover_slot(void *arg, struct dlm_slot *slot)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
+ slot->nodeid, slot->slot);
+ conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
+
+}
+
+static void user_recover_done(void *arg, struct dlm_slot *slots,
+ int num_slots, int our_slot,
+ uint32_t generation)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ int i;
+
+ for (i = 0; i < num_slots; i++)
+ if (slots[i].slot == our_slot) {
+ atomic_set(&lc->oc_this_node, slots[i].nodeid);
+ break;
+ }
+
+ lc->oc_our_slot = our_slot;
+ wake_up(&lc->oc_wait);
+}
+
+static const struct dlm_lockspace_ops ocfs2_ls_ops = {
+ .recover_prep = user_recover_prep,
+ .recover_slot = user_recover_slot,
+ .recover_done = user_recover_done,
+};
+
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+ version_unlock(conn);
+ dlm_release_lockspace(conn->cc_lockspace, 2);
+ conn->cc_lockspace = NULL;
+ ocfs2_live_connection_drop(conn->cc_private);
+ conn->cc_private = NULL;
+ return 0;
+}
+
static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
{
dlm_lockspace_t *fsdlm;
- struct ocfs2_live_connection *control;
- int rc = 0;
+ struct ocfs2_live_connection *lc;
+ int rc, ops_rv;
BUG_ON(conn == NULL);
- rc = ocfs2_live_connection_new(conn, &control);
+ lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+ if (!lc) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ init_waitqueue_head(&lc->oc_wait);
+ init_completion(&lc->oc_sync_wait);
+ atomic_set(&lc->oc_this_node, 0);
+ conn->cc_private = lc;
+ lc->oc_type = NO_CONTROLD;
+
+ rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
+ DLM_LSFL_FS, DLM_LVB_LEN,
+ &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
if (rc)
goto out;
+ if (ops_rv == -EOPNOTSUPP) {
+ lc->oc_type = WITH_CONTROLD;
+ printk(KERN_NOTICE "ocfs2: You seem to be using an older "
+ "version of dlm_controld and/or ocfs2-tools."
+ " Please consider upgrading.\n");
+ } else if (ops_rv) {
+ rc = ops_rv;
+ goto out;
+ }
+ conn->cc_lockspace = fsdlm;
+
+ rc = ocfs2_live_connection_attach(conn, lc);
+ if (rc)
+ goto out;
+
+ if (lc->oc_type == NO_CONTROLD) {
+ rc = get_protocol_version(conn);
+ if (rc) {
+ printk(KERN_ERR "ocfs2: Could not determine"
+ " locking version\n");
+ user_cluster_disconnect(conn);
+ goto out;
+ }
+ wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
+ }
+
/*
* running_proto must have been set before we allowed any mounts
* to proceed.
@@ -823,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
printk(KERN_ERR
"Unable to mount with fs locking protocol version "
- "%u.%u because the userspace control daemon has "
- "negotiated %u.%u\n",
+ "%u.%u because negotiated protocol is %u.%u\n",
conn->cc_version.pv_major, conn->cc_version.pv_minor,
running_proto.pv_major, running_proto.pv_minor);
rc = -EPROTO;
- ocfs2_live_connection_drop(control);
- goto out;
+ ocfs2_live_connection_drop(lc);
+ lc = NULL;
}
- rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
- &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN);
- if (rc) {
- ocfs2_live_connection_drop(control);
- goto out;
- }
-
- conn->cc_private = control;
- conn->cc_lockspace = fsdlm;
out:
+ if (rc && lc)
+ kfree(lc);
return rc;
}
-static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
-{
- dlm_release_lockspace(conn->cc_lockspace, 2);
- conn->cc_lockspace = NULL;
- ocfs2_live_connection_drop(conn->cc_private);
- conn->cc_private = NULL;
- return 0;
-}
-static int user_cluster_this_node(unsigned int *this_node)
+static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *this_node)
{
int rc;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ if (lc->oc_type == WITH_CONTROLD)
+ rc = ocfs2_control_get_this_node();
+ else if (lc->oc_type == NO_CONTROLD)
+ rc = atomic_read(&lc->oc_this_node);
+ else
+ rc = -EINVAL;
- rc = ocfs2_control_get_this_node();
if (rc < 0)
return rc;
@@ -873,6 +1096,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
.dlm_lock = user_dlm_lock,
.dlm_unlock = user_dlm_unlock,
.lock_status = user_dlm_lock_status,
+ .lvb_valid = user_dlm_lvb_valid,
.lock_lvb = user_dlm_lvb,
.plock = user_plock,
.dump_lksb = user_dlm_dump_lksb,
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 68b668b0e60..5d965e83bd4 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -6,7 +6,7 @@
* Code which implements an OCFS2 specific interface to underlying
* cluster stacks.
*
- * Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2007, 2009 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -36,7 +36,7 @@
#define OCFS2_STACK_PLUGIN_USER "user"
#define OCFS2_MAX_HB_CTL_PATH 256
-static struct ocfs2_locking_protocol *lproto;
+static struct ocfs2_protocol_version locking_max_version;
static DEFINE_SPINLOCK(ocfs2_stack_lock);
static LIST_HEAD(ocfs2_stack_list);
static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
@@ -176,7 +176,7 @@ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
spin_lock(&ocfs2_stack_lock);
if (!ocfs2_stack_lookup(plugin->sp_name)) {
plugin->sp_count = 0;
- plugin->sp_proto = lproto;
+ plugin->sp_max_proto = locking_max_version;
list_add(&plugin->sp_list, &ocfs2_stack_list);
printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
plugin->sp_name);
@@ -213,76 +213,76 @@ void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
}
EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
-void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
+void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto)
{
struct ocfs2_stack_plugin *p;
- BUG_ON(proto == NULL);
-
spin_lock(&ocfs2_stack_lock);
- BUG_ON(active_stack != NULL);
+ if (memcmp(max_proto, &locking_max_version,
+ sizeof(struct ocfs2_protocol_version))) {
+ BUG_ON(locking_max_version.pv_major != 0);
- lproto = proto;
- list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
- p->sp_proto = lproto;
+ locking_max_version = *max_proto;
+ list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+ p->sp_max_proto = locking_max_version;
+ }
}
-
spin_unlock(&ocfs2_stack_lock);
}
-EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol);
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version);
/*
- * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take
- * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the
- * underlying stack plugins need to pilfer the lksb off of the lock_res.
- * If some other structure needs to be passed as an astarg, the plugins
- * will need to be given a different avenue to the lksb.
+ * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument
+ * for the ast and bast functions. They will pass the lksb to the ast
+ * and bast. The caller can wrap the lksb with their own structure to
+ * get more information.
*/
int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
int mode,
- union ocfs2_dlm_lksb *lksb,
+ struct ocfs2_dlm_lksb *lksb,
u32 flags,
void *name,
- unsigned int namelen,
- struct ocfs2_lock_res *astarg)
+ unsigned int namelen)
{
- BUG_ON(lproto == NULL);
-
+ if (!lksb->lksb_conn)
+ lksb->lksb_conn = conn;
+ else
+ BUG_ON(lksb->lksb_conn != conn);
return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
- name, namelen, astarg);
+ name, namelen);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
- union ocfs2_dlm_lksb *lksb,
- u32 flags,
- struct ocfs2_lock_res *astarg)
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags)
{
- BUG_ON(lproto == NULL);
+ BUG_ON(lksb->lksb_conn == NULL);
- return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg);
+ return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
-int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
{
return active_stack->sp_ops->lock_status(lksb);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
-/*
- * Why don't we cast to ocfs2_meta_lvb? The "clean" answer is that we
- * don't cast at the glue level. The real answer is that the header
- * ordering is nigh impossible.
- */
-void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
+{
+ return active_stack->sp_ops->lvb_valid(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
+
+void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
{
return active_stack->sp_ops->lock_lvb(lksb);
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
-void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
{
active_stack->sp_ops->dump_lksb(lksb);
}
@@ -309,8 +309,11 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
EXPORT_SYMBOL_GPL(ocfs2_plock);
int ocfs2_cluster_connect(const char *stack_name,
+ const char *cluster_name,
+ int cluster_name_len,
const char *group,
int grouplen,
+ struct ocfs2_locking_protocol *lproto,
void (*recovery_handler)(int node_num,
void *recovery_data),
void *recovery_data,
@@ -328,6 +331,12 @@ int ocfs2_cluster_connect(const char *stack_name,
goto out;
}
+ if (memcmp(&lproto->lp_max_version, &locking_max_version,
+ sizeof(struct ocfs2_protocol_version))) {
+ rc = -EINVAL;
+ goto out;
+ }
+
new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
GFP_KERNEL);
if (!new_conn) {
@@ -335,11 +344,16 @@ int ocfs2_cluster_connect(const char *stack_name,
goto out;
}
- memcpy(new_conn->cc_name, group, grouplen);
+ strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
new_conn->cc_namelen = grouplen;
+ if (cluster_name_len)
+ strlcpy(new_conn->cc_cluster_name, cluster_name,
+ CLUSTER_NAME_MAX + 1);
+ new_conn->cc_cluster_name_len = cluster_name_len;
new_conn->cc_recovery_handler = recovery_handler;
new_conn->cc_recovery_data = recovery_data;
+ new_conn->cc_proto = lproto;
/* Start the new connection at our maximum compatibility level */
new_conn->cc_version = lproto->lp_max_version;
@@ -365,6 +379,25 @@ out:
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
+/* The caller will ensure all nodes have the same cluster stack */
+int ocfs2_cluster_connect_agnostic(const char *group,
+ int grouplen,
+ struct ocfs2_locking_protocol *lproto,
+ void (*recovery_handler)(int node_num,
+ void *recovery_data),
+ void *recovery_data,
+ struct ocfs2_cluster_connection **conn)
+{
+ char *stack_name = NULL;
+
+ if (cluster_stack_name[0])
+ stack_name = cluster_stack_name;
+ return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
+ lproto, recovery_handler, recovery_data,
+ conn);
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
+
/* If hangup_pending is 0, the stack driver will be dropped */
int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
int hangup_pending)
@@ -434,9 +467,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen)
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
-int ocfs2_cluster_this_node(unsigned int *node)
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node)
{
- return active_stack->sp_ops->this_node(node);
+ return active_stack->sp_ops->this_node(conn, node);
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
@@ -452,17 +486,17 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
ssize_t ret = 0;
spin_lock(&ocfs2_stack_lock);
- if (lproto)
+ if (locking_max_version.pv_major)
ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
- lproto->lp_max_version.pv_major,
- lproto->lp_max_version.pv_minor);
+ locking_max_version.pv_major,
+ locking_max_version.pv_minor);
spin_unlock(&ocfs2_stack_lock);
return ret;
}
static struct kobj_attribute ocfs2_attr_max_locking_protocol =
- __ATTR(max_locking_protocol, S_IFREG | S_IRUGO,
+ __ATTR(max_locking_protocol, S_IRUGO,
ocfs2_max_locking_protocol_show, NULL);
static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
@@ -494,7 +528,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
- __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO,
+ __ATTR(loaded_cluster_plugins, S_IRUGO,
ocfs2_loaded_cluster_plugins_show, NULL);
static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
@@ -516,7 +550,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
- __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO,
+ __ATTR(active_cluster_plugin, S_IRUGO,
ocfs2_active_cluster_plugin_show, NULL);
static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
@@ -565,15 +599,29 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
static struct kobj_attribute ocfs2_attr_cluster_stack =
- __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR,
+ __ATTR(cluster_stack, S_IRUGO | S_IWUSR,
ocfs2_cluster_stack_show,
ocfs2_cluster_stack_store);
+
+
+static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "1\n");
+}
+
+static struct kobj_attribute ocfs2_attr_dlm_recover_support =
+ __ATTR(dlm_recover_callback_support, S_IRUGO,
+ ocfs2_dlm_recover_show, NULL);
+
static struct attribute *ocfs2_attrs[] = {
&ocfs2_attr_max_locking_protocol.attr,
&ocfs2_attr_loaded_cluster_plugins.attr,
&ocfs2_attr_active_cluster_plugin.attr,
&ocfs2_attr_cluster_stack.attr,
+ &ocfs2_attr_dlm_recover_support.attr,
NULL,
};
@@ -617,56 +665,51 @@ error:
#define FS_OCFS2_NM 1
-static ctl_table ocfs2_nm_table[] = {
+static struct ctl_table ocfs2_nm_table[] = {
{
- .ctl_name = 1,
.procname = "hb_ctl_path",
.data = ocfs2_hb_ctl_path,
.maxlen = OCFS2_MAX_HB_CTL_PATH,
.mode = 0644,
- .proc_handler = &proc_dostring,
- .strategy = &sysctl_string,
+ .proc_handler = proc_dostring,
},
- { .ctl_name = 0 }
+ { }
};
-static ctl_table ocfs2_mod_table[] = {
+static struct ctl_table ocfs2_mod_table[] = {
{
- .ctl_name = FS_OCFS2_NM,
.procname = "nm",
.data = NULL,
.maxlen = 0,
.mode = 0555,
.child = ocfs2_nm_table
},
- { .ctl_name = 0}
+ { }
};
-static ctl_table ocfs2_kern_table[] = {
+static struct ctl_table ocfs2_kern_table[] = {
{
- .ctl_name = FS_OCFS2,
.procname = "ocfs2",
.data = NULL,
.maxlen = 0,
.mode = 0555,
.child = ocfs2_mod_table
},
- { .ctl_name = 0}
+ { }
};
-static ctl_table ocfs2_root_table[] = {
+static struct ctl_table ocfs2_root_table[] = {
{
- .ctl_name = CTL_FS,
.procname = "fs",
.data = NULL,
.maxlen = 0,
.mode = 0555,
.child = ocfs2_kern_table
},
- { .ctl_name = 0 }
+ { }
};
-static struct ctl_table_header *ocfs2_table_header = NULL;
+static struct ctl_table_header *ocfs2_table_header;
/*
@@ -689,7 +732,10 @@ static int __init ocfs2_stack_glue_init(void)
static void __exit ocfs2_stack_glue_exit(void)
{
- lproto = NULL;
+ memset(&locking_max_version, 0,
+ sizeof(struct ocfs2_protocol_version));
+ locking_max_version.pv_major = 0;
+ locking_max_version.pv_minor = 0;
ocfs2_sysfs_exit();
if (ocfs2_table_header)
unregister_sysctl_table(ocfs2_table_header);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index c571af375ef..66334a30cea 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -45,6 +45,9 @@ struct file_lock;
*/
#define GROUP_NAME_MAX 64
+/* This shadows OCFS2_CLUSTER_NAME_LEN */
+#define CLUSTER_NAME_MAX 16
+
/*
* ocfs2_protocol_version changes when ocfs2 does something different in
@@ -56,17 +59,6 @@ struct ocfs2_protocol_version {
};
/*
- * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
- */
-struct ocfs2_locking_protocol {
- struct ocfs2_protocol_version lp_max_version;
- void (*lp_lock_ast)(void *astarg);
- void (*lp_blocking_ast)(void *astarg, int level);
- void (*lp_unlock_ast)(void *astarg, int error);
-};
-
-
-/*
* The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
* has a pointer to separately allocated lvb space. This struct exists only to
* include in the lksb union to make space for a combined dlm_lksb and lvb.
@@ -81,21 +73,39 @@ struct fsdlm_lksb_plus_lvb {
* size of the union is known. Lock status structures are embedded in
* ocfs2 inodes.
*/
-union ocfs2_dlm_lksb {
- struct dlm_lockstatus lksb_o2dlm;
- struct dlm_lksb lksb_fsdlm;
- struct fsdlm_lksb_plus_lvb padding;
+struct ocfs2_cluster_connection;
+struct ocfs2_dlm_lksb {
+ union {
+ struct dlm_lockstatus lksb_o2dlm;
+ struct dlm_lksb lksb_fsdlm;
+ struct fsdlm_lksb_plus_lvb padding;
+ };
+ struct ocfs2_cluster_connection *lksb_conn;
+};
+
+/*
+ * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
+ */
+struct ocfs2_locking_protocol {
+ struct ocfs2_protocol_version lp_max_version;
+ void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
+ void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
+ void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
};
+
/*
* A cluster connection. Mostly opaque to ocfs2, the connection holds
* state for the underlying stack. ocfs2 does use cc_version to determine
* locking compatibility.
*/
struct ocfs2_cluster_connection {
- char cc_name[GROUP_NAME_MAX];
+ char cc_name[GROUP_NAME_MAX + 1];
int cc_namelen;
+ char cc_cluster_name[CLUSTER_NAME_MAX + 1];
+ int cc_cluster_name_len;
struct ocfs2_protocol_version cc_version;
+ struct ocfs2_locking_protocol *cc_proto;
void (*cc_recovery_handler)(int node_num, void *recovery_data);
void *cc_recovery_data;
void *cc_lockspace;
@@ -121,7 +131,7 @@ struct ocfs2_stack_operations {
*
* ->connect() must not return until it is guaranteed that
*
- * - Node down notifications for the filesystem will be recieved
+ * - Node down notifications for the filesystem will be received
* and passed to conn->cc_recovery_handler().
* - Locking requests for the filesystem will be processed.
*/
@@ -147,7 +157,8 @@ struct ocfs2_stack_operations {
* ->this_node() returns the cluster's unique identifier for the
* local node.
*/
- int (*this_node)(unsigned int *node);
+ int (*this_node)(struct ocfs2_cluster_connection *conn,
+ unsigned int *node);
/*
* Call the underlying dlm lock function. The ->dlm_lock()
@@ -155,27 +166,29 @@ struct ocfs2_stack_operations {
*
* ast and bast functions are not part of the call because the
* stack will likely want to wrap ast and bast calls before passing
- * them to stack->sp_proto.
+ * them to stack->sp_proto. There is no astarg. The lksb will
+ * be passed back to the ast and bast functions. The caller can
+ * use this to find their object.
*/
int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
int mode,
- union ocfs2_dlm_lksb *lksb,
+ struct ocfs2_dlm_lksb *lksb,
u32 flags,
void *name,
- unsigned int namelen,
- void *astarg);
+ unsigned int namelen);
/*
* Call the underlying dlm unlock function. The ->dlm_unlock()
* function should convert the flags as appropriate.
*
* The unlock ast is not passed, as the stack will want to wrap
- * it before calling stack->sp_proto->lp_unlock_ast().
+ * it before calling stack->sp_proto->lp_unlock_ast(). There is
+ * no astarg. The lksb will be passed back to the unlock ast
+ * function. The caller can use this to find their object.
*/
int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
- union ocfs2_dlm_lksb *lksb,
- u32 flags,
- void *astarg);
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags);
/*
* Return the status of the current lock status block. The fs
@@ -183,12 +196,17 @@ struct ocfs2_stack_operations {
* callback pulls out the stack-specific lksb, converts the status
* to a proper errno, and returns it.
*/
- int (*lock_status)(union ocfs2_dlm_lksb *lksb);
+ int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
+
+ /*
+ * Return non-zero if the LVB is valid.
+ */
+ int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
/*
* Pull the lvb pointer off of the stack-specific lksb.
*/
- void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
+ void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
/*
* Cluster-aware posix locks
@@ -205,7 +223,7 @@ struct ocfs2_stack_operations {
* This is an optoinal debugging hook. If provided, the
* stack can dump debugging information about this lock.
*/
- void (*dump_lksb)(union ocfs2_dlm_lksb *lksb);
+ void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
};
/*
@@ -221,45 +239,59 @@ struct ocfs2_stack_plugin {
/* These are managed by the stackglue code. */
struct list_head sp_list;
unsigned int sp_count;
- struct ocfs2_locking_protocol *sp_proto;
+ struct ocfs2_protocol_version sp_max_proto;
};
/* Used by the filesystem */
int ocfs2_cluster_connect(const char *stack_name,
+ const char *cluster_name,
+ int cluster_name_len,
const char *group,
int grouplen,
+ struct ocfs2_locking_protocol *lproto,
void (*recovery_handler)(int node_num,
void *recovery_data),
void *recovery_data,
struct ocfs2_cluster_connection **conn);
+/*
+ * Used by callers that don't store their stack name. They must ensure
+ * all nodes have the same stack.
+ */
+int ocfs2_cluster_connect_agnostic(const char *group,
+ int grouplen,
+ struct ocfs2_locking_protocol *lproto,
+ void (*recovery_handler)(int node_num,
+ void *recovery_data),
+ void *recovery_data,
+ struct ocfs2_cluster_connection **conn);
int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
int hangup_pending);
void ocfs2_cluster_hangup(const char *group, int grouplen);
-int ocfs2_cluster_this_node(unsigned int *node);
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node);
struct ocfs2_lock_res;
int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
int mode,
- union ocfs2_dlm_lksb *lksb,
+ struct ocfs2_dlm_lksb *lksb,
u32 flags,
void *name,
- unsigned int namelen,
- struct ocfs2_lock_res *astarg);
+ unsigned int namelen);
int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
- union ocfs2_dlm_lksb *lksb,
- u32 flags,
- struct ocfs2_lock_res *astarg);
+ struct ocfs2_dlm_lksb *lksb,
+ u32 flags);
-int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
-void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
-void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
+int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
+int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
+void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
+void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
int ocfs2_stack_supports_plocks(void);
int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
struct file *file, int cmd, struct file_lock *fl);
-void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
+void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
/* Used by stack plugins */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8439f6b324b..0cb889a17ae 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -29,7 +29,6 @@
#include <linux/slab.h>
#include <linux/highmem.h>
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -44,6 +43,7 @@
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -51,7 +51,33 @@
#define ALLOC_NEW_GROUP 0x1
#define ALLOC_GROUPS_FROM_GLOBAL 0x2
-#define OCFS2_MAX_INODES_TO_STEAL 1024
+#define OCFS2_MAX_TO_STEAL 1024
+
+struct ocfs2_suballoc_result {
+ u64 sr_bg_blkno; /* The bg we allocated from. Set
+ to 0 when a block group is
+ contiguous. */
+ u64 sr_bg_stable_blkno; /*
+ * Doesn't change, always
+ * set to target block
+ * group descriptor
+ * block.
+ */
+ u64 sr_blkno; /* The first allocated block */
+ unsigned int sr_bit_offset; /* The bit in the bg */
+ unsigned int sr_bits; /* How many bits we claimed */
+};
+
+static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
+{
+ if (res->sr_blkno == 0)
+ return 0;
+
+ if (res->sr_bg_blkno)
+ return res->sr_bg_blkno;
+
+ return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
+}
static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
@@ -60,6 +86,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *bg_bh,
u64 group_blkno,
+ unsigned int group_clusters,
u16 my_chain,
struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
@@ -73,35 +100,19 @@ static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
- u16 *bit_off, u16 *bits_found);
+ struct ocfs2_suballoc_result *res);
static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
- u16 *bit_off, u16 *bits_found);
-static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
- struct ocfs2_alloc_context *ac,
+ struct ocfs2_suballoc_result *res);
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 *bg_blkno);
+ struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
int nr);
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits);
-static inline int ocfs2_block_group_clear_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits);
-
static int ocfs2_relink_block_group(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *fe_bh,
@@ -137,6 +148,11 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
}
brelse(ac->ac_bh);
ac->ac_bh = NULL;
+ ac->ac_resv = NULL;
+ if (ac->ac_find_loc_priv) {
+ kfree(ac->ac_find_loc_priv);
+ ac->ac_find_loc_priv = NULL;
+ }
}
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -152,7 +168,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
#define do_error(fmt, ...) \
do{ \
- if (clean_error) \
+ if (resize) \
mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
else \
ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +176,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
static int ocfs2_validate_gd_self(struct super_block *sb,
struct buffer_head *bh,
- int clean_error)
+ int resize)
{
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -211,7 +227,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
static int ocfs2_validate_gd_parent(struct super_block *sb,
struct ocfs2_dinode *di,
struct buffer_head *bh,
- int clean_error)
+ int resize)
{
unsigned int max_bits;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +249,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
return -EINVAL;
}
- if (le16_to_cpu(gd->bg_chain) >=
- le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
+ /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
+ if ((le16_to_cpu(gd->bg_chain) >
+ le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
+ ((le16_to_cpu(gd->bg_chain) ==
+ le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
do_error("Group descriptor #%llu has bad chain %u",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_chain));
@@ -283,8 +302,8 @@ static int ocfs2_validate_group_descriptor(struct super_block *sb,
int rc;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
- mlog(0, "Validating group descriptor %llu\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_validate_group_descriptor(
+ (unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
@@ -310,7 +329,7 @@ int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
int rc;
struct buffer_head *tmp = *bh;
- rc = ocfs2_read_block(inode, gd_blkno, &tmp,
+ rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
ocfs2_validate_group_descriptor);
if (rc)
goto out;
@@ -329,19 +348,41 @@ out:
return rc;
}
+static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
+ struct ocfs2_group_desc *bg,
+ struct ocfs2_chain_list *cl,
+ u64 p_blkno, unsigned int clusters)
+{
+ struct ocfs2_extent_list *el = &bg->bg_list;
+ struct ocfs2_extent_rec *rec;
+
+ BUG_ON(!ocfs2_supports_discontig_bg(osb));
+ if (!el->l_next_free_rec)
+ el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
+ rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
+ rec->e_blkno = cpu_to_le64(p_blkno);
+ rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
+ le16_to_cpu(cl->cl_bpc));
+ rec->e_leaf_clusters = cpu_to_le16(clusters);
+ le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
+ le16_add_cpu(&bg->bg_free_bits_count,
+ clusters * le16_to_cpu(cl->cl_bpc));
+ le16_add_cpu(&el->l_next_free_rec, 1);
+}
+
static int ocfs2_block_group_fill(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *bg_bh,
u64 group_blkno,
+ unsigned int group_clusters,
u16 my_chain,
struct ocfs2_chain_list *cl)
{
int status = 0;
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct super_block * sb = alloc_inode->i_sb;
- mlog_entry_void();
-
if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
"b_blocknr (%llu)",
@@ -352,7 +393,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
}
status = ocfs2_journal_access_gd(handle,
- alloc_inode,
+ INODE_CACHE(alloc_inode),
bg_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
@@ -363,19 +404,23 @@ static int ocfs2_block_group_fill(handle_t *handle,
memset(bg, 0, sb->s_blocksize);
strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
- bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
- bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+ bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
+ osb->s_feature_incompat));
bg->bg_chain = cpu_to_le16(my_chain);
bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
bg->bg_blkno = cpu_to_le64(group_blkno);
+ if (group_clusters == le16_to_cpu(cl->cl_cpg))
+ bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+ else
+ ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
+ group_clusters);
+
/* set the 1st bit in the bitmap to account for the descriptor block */
ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
- status = ocfs2_journal_dirty(handle, bg_bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, bg_bh);
/* There is no need to zero out or otherwise initialize the
* other blocks in a group - All valid FS metadata in a block
@@ -383,7 +428,8 @@ static int ocfs2_block_group_fill(handle_t *handle,
* allocation time. */
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -401,6 +447,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
return best;
}
+static struct buffer_head *
+ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_alloc_context *ac,
+ struct ocfs2_chain_list *cl)
+{
+ int status;
+ u32 bit_off, num_bits;
+ u64 bg_blkno;
+ struct buffer_head *bg_bh;
+ unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
+
+ status = ocfs2_claim_clusters(handle, ac,
+ le16_to_cpu(cl->cl_cpg), &bit_off,
+ &num_bits);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* setup the group */
+ bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ trace_ocfs2_block_group_alloc_contig(
+ (unsigned long long)bg_blkno, alloc_rec);
+
+ bg_bh = sb_getblk(osb->sb, bg_blkno);
+ if (!bg_bh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
+
+ status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+ bg_blkno, num_bits, alloc_rec, cl);
+ if (status < 0) {
+ brelse(bg_bh);
+ mlog_errno(status);
+ }
+
+bail:
+ return status ? ERR_PTR(status) : bg_bh;
+}
+
+static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ unsigned int min_bits,
+ u32 *bit_off, u32 *num_bits)
+{
+ int status = 0;
+
+ while (min_bits) {
+ status = ocfs2_claim_clusters(handle, ac, min_bits,
+ bit_off, num_bits);
+ if (status != -ENOSPC)
+ break;
+
+ min_bits >>= 1;
+ }
+
+ return status;
+}
+
+static int ocfs2_block_group_grow_discontig(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *bg_bh,
+ struct ocfs2_alloc_context *ac,
+ struct ocfs2_chain_list *cl,
+ unsigned int min_bits)
+{
+ int status;
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+ struct ocfs2_group_desc *bg =
+ (struct ocfs2_group_desc *)bg_bh->b_data;
+ unsigned int needed = le16_to_cpu(cl->cl_cpg) -
+ le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+ u32 p_cpos, clusters;
+ u64 p_blkno;
+ struct ocfs2_extent_list *el = &bg->bg_list;
+
+ status = ocfs2_journal_access_gd(handle,
+ INODE_CACHE(alloc_inode),
+ bg_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
+ le16_to_cpu(el->l_count))) {
+ if (min_bits > needed)
+ min_bits = needed;
+ status = ocfs2_block_group_claim_bits(osb, handle, ac,
+ min_bits, &p_cpos,
+ &clusters);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+ p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
+ ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
+ clusters);
+
+ min_bits = clusters;
+ needed = le16_to_cpu(cl->cl_cpg) -
+ le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+ }
+
+ if (needed > 0) {
+ /*
+ * We have used up all the extent rec but can't fill up
+ * the cpg. So bail out.
+ */
+ status = -ENOSPC;
+ goto bail;
+ }
+
+ ocfs2_journal_dirty(handle, bg_bh);
+
+bail:
+ return status;
+}
+
+static void ocfs2_bg_alloc_cleanup(handle_t *handle,
+ struct ocfs2_alloc_context *cluster_ac,
+ struct inode *alloc_inode,
+ struct buffer_head *bg_bh)
+{
+ int i, ret;
+ struct ocfs2_group_desc *bg;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+
+ if (!bg_bh)
+ return;
+
+ bg = (struct ocfs2_group_desc *)bg_bh->b_data;
+ el = &bg->bg_list;
+ for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
+ ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
+ cluster_ac->ac_bh,
+ le64_to_cpu(rec->e_blkno),
+ le16_to_cpu(rec->e_leaf_clusters));
+ if (ret)
+ mlog_errno(ret);
+ /* Try all the clusters to free */
+ }
+
+ ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
+ brelse(bg_bh);
+}
+
+static struct buffer_head *
+ocfs2_block_group_alloc_discontig(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_alloc_context *ac,
+ struct ocfs2_chain_list *cl)
+{
+ int status;
+ u32 bit_off, num_bits;
+ u64 bg_blkno;
+ unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
+ struct buffer_head *bg_bh = NULL;
+ unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+
+ if (!ocfs2_supports_discontig_bg(osb)) {
+ status = -ENOSPC;
+ goto bail;
+ }
+
+ status = ocfs2_extend_trans(handle,
+ ocfs2_calc_bg_discontig_credits(osb->sb));
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /*
+ * We're going to be grabbing from multiple cluster groups.
+ * We don't have enough credits to relink them all, and the
+ * cluster groups will be staying in cache for the duration of
+ * this operation.
+ */
+ ac->ac_disable_chain_relink = 1;
+
+ /* Claim the first region */
+ status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
+ &bit_off, &num_bits);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+ min_bits = num_bits;
+
+ /* setup the group */
+ bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ trace_ocfs2_block_group_alloc_discontig(
+ (unsigned long long)bg_blkno, alloc_rec);
+
+ bg_bh = sb_getblk(osb->sb, bg_blkno);
+ if (!bg_bh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
+
+ status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+ bg_blkno, num_bits, alloc_rec, cl);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
+ bg_bh, ac, cl, min_bits);
+ if (status)
+ mlog_errno(status);
+
+bail:
+ if (status)
+ ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
+ return status ? ERR_PTR(status) : bg_bh;
+}
+
/*
* We expect the block group allocator to already be locked.
*/
@@ -416,16 +694,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
struct ocfs2_chain_list *cl;
struct ocfs2_alloc_context *ac = NULL;
handle_t *handle = NULL;
- u32 bit_off, num_bits;
u16 alloc_rec;
- u64 bg_blkno;
struct buffer_head *bg_bh = NULL;
struct ocfs2_group_desc *bg;
BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
- mlog_entry_void();
-
cl = &fe->id2.i_chain;
status = ocfs2_reserve_clusters_with_limit(osb,
le16_to_cpu(cl->cl_cpg),
@@ -447,61 +721,39 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
}
if (last_alloc_group && *last_alloc_group != 0) {
- mlog(0, "use old allocation group %llu for block group alloc\n",
- (unsigned long long)*last_alloc_group);
+ trace_ocfs2_block_group_alloc(
+ (unsigned long long)*last_alloc_group);
ac->ac_last_group = *last_alloc_group;
}
- status = ocfs2_claim_clusters(osb,
- handle,
- ac,
- le16_to_cpu(cl->cl_cpg),
- &bit_off,
- &num_bits);
- if (status < 0) {
+
+ bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
+ ac, cl);
+ if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
+ bg_bh = ocfs2_block_group_alloc_discontig(handle,
+ alloc_inode,
+ ac, cl);
+ if (IS_ERR(bg_bh)) {
+ status = PTR_ERR(bg_bh);
+ bg_bh = NULL;
if (status != -ENOSPC)
mlog_errno(status);
goto bail;
}
-
- alloc_rec = ocfs2_find_smallest_chain(cl);
-
- /* setup the group */
- bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
- mlog(0, "new descriptor, record %u, at block %llu\n",
- alloc_rec, (unsigned long long)bg_blkno);
-
- bg_bh = sb_getblk(osb->sb, bg_blkno);
- if (!bg_bh) {
- status = -EIO;
- mlog_errno(status);
- goto bail;
- }
- ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
-
- status = ocfs2_block_group_fill(handle,
- alloc_inode,
- bg_bh,
- bg_blkno,
- alloc_rec,
- cl);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
bg = (struct ocfs2_group_desc *) bg_bh->b_data;
- status = ocfs2_journal_access_di(handle, alloc_inode,
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+ alloc_rec = le16_to_cpu(bg->bg_chain);
le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
le16_to_cpu(bg->bg_free_bits_count));
- le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
- cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno);
+ le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
+ le16_to_cpu(bg->bg_bits));
+ cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
le16_add_cpu(&cl->cl_next_free_rec, 1);
@@ -510,11 +762,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, bh);
spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -523,6 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
+ ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
status = 0;
@@ -539,7 +788,8 @@ bail:
brelse(bg_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -557,8 +807,6 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
struct ocfs2_dinode *fe;
u32 free_bits;
- mlog_entry_void();
-
alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
if (!alloc_inode) {
mlog_errno(-EINVAL);
@@ -598,16 +846,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
if (bits_wanted > free_bits) {
/* cluster bitmap never grows */
if (ocfs2_is_cluster_bitmap(alloc_inode)) {
- mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
- bits_wanted, free_bits);
+ trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
+ free_bits);
status = -ENOSPC;
goto bail;
}
if (!(flags & ALLOC_NEW_GROUP)) {
- mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
- "and we don't alloc a new group for it.\n",
- slot, bits_wanted, free_bits);
+ trace_ocfs2_reserve_suballoc_bits_no_new_group(
+ slot, bits_wanted, free_bits);
status = -ENOSPC;
goto bail;
}
@@ -633,16 +880,118 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
bail:
brelse(bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
+ return status;
+}
+
+static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
+{
+ spin_lock(&osb->osb_lock);
+ osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
+ spin_unlock(&osb->osb_lock);
+ atomic_set(&osb->s_num_inodes_stolen, 0);
+}
+
+static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
+{
+ spin_lock(&osb->osb_lock);
+ osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
+ spin_unlock(&osb->osb_lock);
+ atomic_set(&osb->s_num_meta_stolen, 0);
+}
+
+void ocfs2_init_steal_slots(struct ocfs2_super *osb)
+{
+ ocfs2_init_inode_steal_slot(osb);
+ ocfs2_init_meta_steal_slot(osb);
+}
+
+static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
+{
+ spin_lock(&osb->osb_lock);
+ if (type == INODE_ALLOC_SYSTEM_INODE)
+ osb->s_inode_steal_slot = slot;
+ else if (type == EXTENT_ALLOC_SYSTEM_INODE)
+ osb->s_meta_steal_slot = slot;
+ spin_unlock(&osb->osb_lock);
+}
+
+static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
+{
+ int slot = OCFS2_INVALID_SLOT;
+
+ spin_lock(&osb->osb_lock);
+ if (type == INODE_ALLOC_SYSTEM_INODE)
+ slot = osb->s_inode_steal_slot;
+ else if (type == EXTENT_ALLOC_SYSTEM_INODE)
+ slot = osb->s_meta_steal_slot;
+ spin_unlock(&osb->osb_lock);
+
+ return slot;
+}
+
+static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
+{
+ return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
+{
+ return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_steal_resource(struct ocfs2_super *osb,
+ struct ocfs2_alloc_context *ac,
+ int type)
+{
+ int i, status = -ENOSPC;
+ int slot = __ocfs2_get_steal_slot(osb, type);
+
+ /* Start to steal resource from the first slot after ours. */
+ if (slot == OCFS2_INVALID_SLOT)
+ slot = osb->slot_num + 1;
+
+ for (i = 0; i < osb->max_slots; i++, slot++) {
+ if (slot == osb->max_slots)
+ slot = 0;
+
+ if (slot == osb->slot_num)
+ continue;
+
+ status = ocfs2_reserve_suballoc_bits(osb, ac,
+ type,
+ (u32)slot, NULL,
+ NOT_ALLOC_NEW_GROUP);
+ if (status >= 0) {
+ __ocfs2_set_steal_slot(osb, slot, type);
+ break;
+ }
+
+ ocfs2_free_ac_resource(ac);
+ }
+
return status;
}
+static int ocfs2_steal_inode(struct ocfs2_super *osb,
+ struct ocfs2_alloc_context *ac)
+{
+ return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_steal_meta(struct ocfs2_super *osb,
+ struct ocfs2_alloc_context *ac)
+{
+ return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
+}
+
int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
int blocks,
struct ocfs2_alloc_context **ac)
{
int status;
- u32 slot;
+ int slot = ocfs2_get_meta_steal_slot(osb);
*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
if (!(*ac)) {
@@ -653,12 +1002,34 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
(*ac)->ac_bits_wanted = blocks;
(*ac)->ac_which = OCFS2_AC_USE_META;
- slot = osb->slot_num;
(*ac)->ac_group_search = ocfs2_block_group_search;
+ if (slot != OCFS2_INVALID_SLOT &&
+ atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
+ goto extent_steal;
+
+ atomic_set(&osb->s_num_meta_stolen, 0);
status = ocfs2_reserve_suballoc_bits(osb, (*ac),
EXTENT_ALLOC_SYSTEM_INODE,
- slot, NULL, ALLOC_NEW_GROUP);
+ (u32)osb->slot_num, NULL,
+ ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
+
+
+ if (status >= 0) {
+ status = 0;
+ if (slot != OCFS2_INVALID_SLOT)
+ ocfs2_init_meta_steal_slot(osb);
+ goto bail;
+ } else if (status < 0 && status != -ENOSPC) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ ocfs2_free_ac_resource(*ac);
+
+extent_steal:
+ status = ocfs2_steal_meta(osb, *ac);
+ atomic_inc(&osb->s_num_meta_stolen);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -672,7 +1043,8 @@ bail:
*ac = NULL;
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -685,43 +1057,11 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
ac);
}
-static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
- struct ocfs2_alloc_context *ac)
-{
- int i, status = -ENOSPC;
- s16 slot = ocfs2_get_inode_steal_slot(osb);
-
- /* Start to steal inodes from the first slot after ours. */
- if (slot == OCFS2_INVALID_SLOT)
- slot = osb->slot_num + 1;
-
- for (i = 0; i < osb->max_slots; i++, slot++) {
- if (slot == osb->max_slots)
- slot = 0;
-
- if (slot == osb->slot_num)
- continue;
-
- status = ocfs2_reserve_suballoc_bits(osb, ac,
- INODE_ALLOC_SYSTEM_INODE,
- slot, NULL,
- NOT_ALLOC_NEW_GROUP);
- if (status >= 0) {
- ocfs2_set_inode_steal_slot(osb, slot);
- break;
- }
-
- ocfs2_free_ac_resource(ac);
- }
-
- return status;
-}
-
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
struct ocfs2_alloc_context **ac)
{
int status;
- s16 slot = ocfs2_get_inode_steal_slot(osb);
+ int slot = ocfs2_get_inode_steal_slot(osb);
u64 alloc_group;
*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
@@ -754,14 +1094,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
* need to check our slots to see whether there is some space for us.
*/
if (slot != OCFS2_INVALID_SLOT &&
- atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
+ atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
goto inode_steal;
atomic_set(&osb->s_num_inodes_stolen, 0);
alloc_group = osb->osb_inode_alloc_group;
status = ocfs2_reserve_suballoc_bits(osb, *ac,
INODE_ALLOC_SYSTEM_INODE,
- osb->slot_num,
+ (u32)osb->slot_num,
&alloc_group,
ALLOC_NEW_GROUP |
ALLOC_GROUPS_FROM_GLOBAL);
@@ -771,8 +1111,8 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
spin_lock(&osb->osb_lock);
osb->osb_inode_alloc_group = alloc_group;
spin_unlock(&osb->osb_lock);
- mlog(0, "after reservation, new allocation group is "
- "%llu\n", (unsigned long long)alloc_group);
+ trace_ocfs2_reserve_new_inode_new_group(
+ (unsigned long long)alloc_group);
/*
* Some inodes must be freed by us, so try to allocate
@@ -789,7 +1129,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
ocfs2_free_ac_resource(*ac);
inode_steal:
- status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
+ status = ocfs2_steal_inode(osb, *ac);
atomic_inc(&osb->s_num_inodes_stolen);
if (status < 0) {
if (status != -ENOSPC)
@@ -804,7 +1144,8 @@ bail:
*ac = NULL;
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -841,8 +1182,6 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
{
int status;
- mlog_entry_void();
-
*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
if (!(*ac)) {
status = -ENOMEM;
@@ -859,11 +1198,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
status = ocfs2_reserve_local_alloc_bits(osb,
bits_wanted,
*ac);
- if (status == -EFBIG) {
- /* The local alloc window is outside ac_max_block.
- * use the main bitmap. */
- status = -ENOSPC;
- } else if ((status < 0) && (status != -ENOSPC)) {
+ if ((status < 0) && (status != -ENOSPC)) {
mlog_errno(status);
goto bail;
}
@@ -885,7 +1220,8 @@ bail:
*ac = NULL;
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -923,22 +1259,30 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
int nr)
{
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+ int ret;
if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
return 0;
- if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
+
+ if (!buffer_jbd(bg_bh))
return 1;
+ jbd_lock_bh_state(bg_bh);
bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
- return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+ if (bg)
+ ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+ else
+ ret = 1;
+ jbd_unlock_bh_state(bg_bh);
+
+ return ret;
}
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
struct buffer_head *bg_bh,
unsigned int bits_wanted,
unsigned int total_bits,
- u16 *bit_off,
- u16 *bits_found)
+ struct ocfs2_suballoc_result *res)
{
void *bitmap;
u16 best_offset, best_size;
@@ -982,14 +1326,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
}
}
- /* XXX: I think the first clause is equivalent to the second
- * - jlbec */
- if (found == bits_wanted) {
- *bit_off = start - found;
- *bits_found = found;
- } else if (best_size) {
- *bit_off = best_offset;
- *bits_found = best_size;
+ if (best_size) {
+ res->sr_bit_offset = best_offset;
+ res->sr_bits = best_size;
} else {
status = -ENOSPC;
/* No error log here -- see the comment above
@@ -999,7 +1338,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
return status;
}
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
+int ocfs2_block_group_set_bits(handle_t *handle,
struct inode *alloc_inode,
struct ocfs2_group_desc *bg,
struct buffer_head *group_bh,
@@ -1010,21 +1349,18 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
void *bitmap = bg->bg_bitmap;
int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
- mlog_entry_void();
-
/* All callers get the descriptor via
* ocfs2_read_group_descriptor(). Any corruption is a code bug. */
BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
- mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
- num_bits);
+ trace_ocfs2_block_group_set_bits(bit_off, num_bits);
if (ocfs2_is_cluster_bitmap(alloc_inode))
journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
status = ocfs2_journal_access_gd(handle,
- alloc_inode,
+ INODE_CACHE(alloc_inode),
group_bh,
journal_type);
if (status < 0) {
@@ -1033,19 +1369,20 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
}
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
-
+ if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+ ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+ " count %u but claims %u are freed. num_bits %d",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count), num_bits);
+ return -EROFS;
+ }
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
- status = ocfs2_journal_dirty(handle,
- group_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, group_bh);
bail:
- mlog_exit(status);
return status;
}
@@ -1078,7 +1415,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
int status;
/* there is a really tiny chance the journal calls could fail,
* but we wouldn't want inconsistent blocks in *any* case. */
- u64 fe_ptr, bg_ptr, prev_bg_ptr;
+ u64 bg_ptr, prev_bg_ptr;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
@@ -1088,70 +1425,49 @@ static int ocfs2_relink_block_group(handle_t *handle,
BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
- mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
- (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
+ trace_ocfs2_relink_block_group(
+ (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
- fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
bg_ptr = le64_to_cpu(bg->bg_next_group);
prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
- status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
+ status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+ prev_bg_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ if (status < 0)
+ goto out;
prev_bg->bg_next_group = bg->bg_next_group;
+ ocfs2_journal_dirty(handle, prev_bg_bh);
- status = ocfs2_journal_dirty(handle, prev_bg_bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
-
- status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+ bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0)
+ goto out_rollback_prev_bg;
bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
+ ocfs2_journal_dirty(handle, bg_bh);
- status = ocfs2_journal_dirty(handle, bg_bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
-
- status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
+ fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0)
+ goto out_rollback_bg;
fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
+ ocfs2_journal_dirty(handle, fe_bh);
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
+out:
+ if (status < 0)
mlog_errno(status);
- goto out_rollback;
- }
-
- status = 0;
-out_rollback:
- if (status < 0) {
- fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
- bg->bg_next_group = cpu_to_le64(bg_ptr);
- prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
- }
-
- mlog_exit(status);
return status;
+
+out_rollback_bg:
+ bg->bg_next_group = cpu_to_le64(bg_ptr);
+out_rollback_prev_bg:
+ prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+ goto out;
}
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
@@ -1166,14 +1482,13 @@ static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
- u16 *bit_off, u16 *bits_found)
+ struct ocfs2_suballoc_result *res)
{
int search = -ENOSPC;
int ret;
u64 blkoff;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- u16 tmp_off, tmp_found;
unsigned int max_bits, gd_cluster_off;
BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -1182,7 +1497,7 @@ static int ocfs2_cluster_group_search(struct inode *inode,
max_bits = le16_to_cpu(gd->bg_bits);
/* Tail groups in cluster bitmaps which aren't cpg
- * aligned are prone to partial extention by a failed
+ * aligned are prone to partial extension by a failed
* fs resize. If the file system resize never got to
* update the dinode cluster count, then we don't want
* to trust any clusters past it, regardless of what
@@ -1192,26 +1507,26 @@ static int ocfs2_cluster_group_search(struct inode *inode,
if ((gd_cluster_off + max_bits) >
OCFS2_I(inode)->ip_clusters) {
max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
- mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- OCFS2_I(inode)->ip_clusters, max_bits);
+ trace_ocfs2_cluster_group_search_wrong_max_bits(
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ le16_to_cpu(gd->bg_bits),
+ OCFS2_I(inode)->ip_clusters, max_bits);
}
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
group_bh, bits_wanted,
- max_bits,
- &tmp_off, &tmp_found);
+ max_bits, res);
if (ret)
return ret;
if (max_block) {
blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
gd_cluster_off +
- tmp_off + tmp_found);
- mlog(0, "Checking %llu against %llu\n",
- (unsigned long long)blkoff,
- (unsigned long long)max_block);
+ res->sr_bit_offset +
+ res->sr_bits);
+ trace_ocfs2_cluster_group_search_max_block(
+ (unsigned long long)blkoff,
+ (unsigned long long)max_block);
if (blkoff > max_block)
return -ENOSPC;
}
@@ -1220,16 +1535,14 @@ static int ocfs2_cluster_group_search(struct inode *inode,
* return success, but we still want to return
* -ENOSPC unless it found the minimum number
* of bits. */
- if (min_bits <= tmp_found) {
- *bit_off = tmp_off;
- *bits_found = tmp_found;
+ if (min_bits <= res->sr_bits)
search = 0; /* success */
- } else if (tmp_found) {
+ else if (res->sr_bits) {
/*
* Don't show bits which we'll be returning
* for allocation to the local alloc bitmap.
*/
- ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
+ ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
}
}
@@ -1240,7 +1553,7 @@ static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
- u16 *bit_off, u16 *bits_found)
+ struct ocfs2_suballoc_result *res)
{
int ret = -ENOSPC;
u64 blkoff;
@@ -1253,13 +1566,13 @@ static int ocfs2_block_group_search(struct inode *inode,
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
group_bh, bits_wanted,
le16_to_cpu(bg->bg_bits),
- bit_off, bits_found);
+ res);
if (!ret && max_block) {
- blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
- *bits_found;
- mlog(0, "Checking %llu against %llu\n",
- (unsigned long long)blkoff,
- (unsigned long long)max_block);
+ blkoff = le64_to_cpu(bg->bg_blkno) +
+ res->sr_bit_offset + res->sr_bits;
+ trace_ocfs2_block_group_search_max_block(
+ (unsigned long long)blkoff,
+ (unsigned long long)max_block);
if (blkoff > max_block)
ret = -ENOSPC;
}
@@ -1268,7 +1581,7 @@ static int ocfs2_block_group_search(struct inode *inode,
return ret;
}
-static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
handle_t *handle,
struct buffer_head *di_bh,
u32 num_bits,
@@ -1279,7 +1592,7 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
- ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
@@ -1289,33 +1602,91 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
-
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, di_bh);
out:
return ret;
}
+void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain)
+{
+ u32 tmp_used;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+ struct ocfs2_chain_list *cl;
+
+ cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
+ tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
+ di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
+ le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
+}
+
+static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
+ struct ocfs2_extent_rec *rec,
+ struct ocfs2_chain_list *cl)
+{
+ unsigned int bpc = le16_to_cpu(cl->cl_bpc);
+ unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
+ unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
+
+ if (res->sr_bit_offset < bitoff)
+ return 0;
+ if (res->sr_bit_offset >= (bitoff + bitcount))
+ return 0;
+ res->sr_blkno = le64_to_cpu(rec->e_blkno) +
+ (res->sr_bit_offset - bitoff);
+ if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
+ res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
+ return 1;
+}
+
+static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
+ struct ocfs2_group_desc *bg,
+ struct ocfs2_suballoc_result *res)
+{
+ int i;
+ u64 bg_blkno = res->sr_bg_blkno; /* Save off */
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
+ struct ocfs2_chain_list *cl = &di->id2.i_chain;
+
+ if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
+ res->sr_blkno = 0;
+ return;
+ }
+
+ res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
+ res->sr_bg_blkno = 0; /* Clear it for contig block groups */
+ if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
+ !bg->bg_list.l_next_free_rec)
+ return;
+
+ for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
+ rec = &bg->bg_list.l_recs[i];
+ if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
+ res->sr_bg_blkno = bg_blkno; /* Restore */
+ break;
+ }
+ }
+}
+
static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 gd_blkno,
+ struct ocfs2_suballoc_result *res,
u16 *bits_left)
{
int ret;
- u16 found;
struct buffer_head *group_bh = NULL;
struct ocfs2_group_desc *gd;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
struct inode *alloc_inode = ac->ac_inode;
- ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
- &group_bh);
+ ret = ocfs2_read_group_descriptor(alloc_inode, di,
+ res->sr_bg_blkno, &group_bh);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -1323,17 +1694,27 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
gd = (struct ocfs2_group_desc *) group_bh->b_data;
ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
- ac->ac_max_block, bit_off, &found);
+ ac->ac_max_block, res);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
goto out;
}
- *num_bits = found;
+ if (!ret)
+ ocfs2_bg_discontig_fix_result(ac, gd, res);
+
+ /*
+ * sr_bg_blkno might have been changed by
+ * ocfs2_bg_discontig_fix_result
+ */
+ res->sr_bg_stable_blkno = group_bh->b_blocknr;
+
+ if (ac->ac_find_loc_only)
+ goto out_loc_only;
ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
- *num_bits,
+ res->sr_bits,
le16_to_cpu(gd->bg_chain));
if (ret < 0) {
mlog_errno(ret);
@@ -1341,10 +1722,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
}
ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
- *bit_off, *num_bits);
- if (ret < 0)
+ res->sr_bit_offset, res->sr_bits);
+ if (ret < 0) {
+ ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
+ res->sr_bits,
+ le16_to_cpu(gd->bg_chain));
mlog_errno(ret);
+ }
+out_loc_only:
*bits_left = le16_to_cpu(gd->bg_free_bits_count);
out:
@@ -1357,14 +1743,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 *bg_blkno,
+ struct ocfs2_suballoc_result *res,
u16 *bits_left)
{
int status;
- u16 chain, tmp_bits;
- u32 tmp_used;
+ u16 chain;
u64 next_group;
struct inode *alloc_inode = ac->ac_inode;
struct buffer_head *group_bh = NULL;
@@ -1374,9 +1757,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
struct ocfs2_group_desc *bg;
chain = ac->ac_chain;
- mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
- bits_wanted, chain,
- (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
+ trace_ocfs2_search_chain_begin(
+ (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
+ bits_wanted, chain);
status = ocfs2_read_group_descriptor(alloc_inode, fe,
le64_to_cpu(cl->cl_recs[chain].c_blkno),
@@ -1392,8 +1775,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
* the 1st group with any empty bits. */
while ((status = ac->ac_group_search(alloc_inode, group_bh,
bits_wanted, min_bits,
- ac->ac_max_block, bit_off,
- &tmp_bits)) == -ENOSPC) {
+ ac->ac_max_block,
+ res)) == -ENOSPC) {
if (!bg->bg_next_group)
break;
@@ -1417,12 +1800,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
goto bail;
}
- mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
- tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
+ trace_ocfs2_search_chain_succ(
+ (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
- *num_bits = tmp_bits;
+ res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
- BUG_ON(*num_bits == 0);
+ BUG_ON(res->sr_bits == 0);
+ if (!status)
+ ocfs2_bg_discontig_fix_result(ac, bg, res);
+
+ /*
+ * sr_bg_blkno might have been changed by
+ * ocfs2_bg_discontig_fix_result
+ */
+ res->sr_bg_stable_blkno = group_bh->b_blocknr;
/*
* Keep track of previous block descriptor read. When
@@ -1437,9 +1828,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
* Do this *after* figuring out how many bits we're taking out
* of our target group.
*/
- if (ac->ac_allow_chain_relink &&
+ if (!ac->ac_disable_chain_relink &&
(prev_group_bh) &&
- (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
+ (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
status = ocfs2_relink_block_group(handle, alloc_inode,
ac->ac_bh, group_bh,
prev_group_bh, chain);
@@ -1449,24 +1840,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
}
}
- /* Ok, claim our bits now: set the info on dinode, chainlist
- * and then the group */
- status = ocfs2_journal_access_di(handle,
- alloc_inode,
- ac->ac_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
- fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
- le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
+ if (ac->ac_find_loc_only)
+ goto out_loc_only;
- status = ocfs2_journal_dirty(handle,
- ac->ac_bh);
- if (status < 0) {
+ status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
+ ac->ac_bh, res->sr_bits,
+ chain);
+ if (status) {
mlog_errno(status);
goto bail;
}
@@ -1475,45 +1855,44 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
alloc_inode,
bg,
group_bh,
- *bit_off,
- *num_bits);
+ res->sr_bit_offset,
+ res->sr_bits);
if (status < 0) {
+ ocfs2_rollback_alloc_dinode_counts(alloc_inode,
+ ac->ac_bh, res->sr_bits, chain);
mlog_errno(status);
goto bail;
}
- mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
- (unsigned long long)le64_to_cpu(fe->i_blkno));
+ trace_ocfs2_search_chain_end(
+ (unsigned long long)le64_to_cpu(fe->i_blkno),
+ res->sr_bits);
- *bg_blkno = le64_to_cpu(bg->bg_blkno);
+out_loc_only:
*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
brelse(group_bh);
brelse(prev_group_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
/* will give out up to bits_wanted contiguous bits. */
-static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
- struct ocfs2_alloc_context *ac,
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 *bg_blkno)
+ struct ocfs2_suballoc_result *res)
{
int status;
u16 victim, i;
u16 bits_left = 0;
- u64 hint_blkno = ac->ac_last_group;
+ u64 hint = ac->ac_last_group;
struct ocfs2_chain_list *cl;
struct ocfs2_dinode *fe;
- mlog_entry_void();
-
BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
BUG_ON(!ac->ac_bh);
@@ -1526,7 +1905,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
le32_to_cpu(fe->id1.bitmap1.i_total)) {
- ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
+ ocfs2_error(ac->ac_inode->i_sb,
+ "Chain allocator dinode %llu has %u used "
"bits but only %u total.",
(unsigned long long)le64_to_cpu(fe->i_blkno),
le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1535,22 +1915,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
goto bail;
}
- if (hint_blkno) {
+ res->sr_bg_blkno = hint;
+ if (res->sr_bg_blkno) {
/* Attempt to short-circuit the usual search mechanism
* by jumping straight to the most recently used
- * allocation group. This helps us mantain some
+ * allocation group. This helps us maintain some
* contiguousness across allocations. */
status = ocfs2_search_one_group(ac, handle, bits_wanted,
- min_bits, bit_off, num_bits,
- hint_blkno, &bits_left);
- if (!status) {
- /* Be careful to update *bg_blkno here as the
- * caller is expecting it to be filled in, and
- * ocfs2_search_one_group() won't do that for
- * us. */
- *bg_blkno = hint_blkno;
+ min_bits, res, &bits_left);
+ if (!status)
goto set_hint;
- }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
@@ -1561,25 +1935,25 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
victim = ocfs2_find_victim_chain(cl);
ac->ac_chain = victim;
- ac->ac_allow_chain_relink = 1;
- status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
- num_bits, bg_blkno, &bits_left);
- if (!status)
+ status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
+ res, &bits_left);
+ if (!status) {
+ hint = ocfs2_group_from_res(res);
goto set_hint;
+ }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
}
- mlog(0, "Search of victim chain %u came up with nothing, "
- "trying all chains now.\n", victim);
+ trace_ocfs2_claim_suballoc_bits(victim);
/* If we didn't pick a good victim, then just default to
* searching each chain in order. Don't allow chain relinking
* because we only calculate enough journal credits for one
* relink per alloc. */
- ac->ac_allow_chain_relink = 0;
+ ac->ac_disable_chain_relink = 1;
for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
if (i == victim)
continue;
@@ -1588,10 +1962,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
ac->ac_chain = i;
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
- bit_off, num_bits, bg_blkno,
- &bits_left);
- if (!status)
+ res, &bits_left);
+ if (!status) {
+ hint = ocfs2_group_from_res(res);
break;
+ }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
@@ -1606,56 +1981,58 @@ set_hint:
if (bits_left < min_bits)
ac->ac_last_group = 0;
else
- ac->ac_last_group = *bg_blkno;
+ ac->ac_last_group = hint;
}
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-int ocfs2_claim_metadata(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_metadata(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
+ u64 *suballoc_loc,
u16 *suballoc_bit_start,
unsigned int *num_bits,
u64 *blkno_start)
{
int status;
- u64 bg_blkno;
+ struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
BUG_ON(!ac);
BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
- status = ocfs2_claim_suballoc_bits(osb,
- ac,
+ status = ocfs2_claim_suballoc_bits(ac,
handle,
bits_wanted,
1,
- suballoc_bit_start,
- num_bits,
- &bg_blkno);
+ &res);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- atomic_inc(&osb->alloc_stats.bg_allocs);
+ atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
- *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
- ac->ac_bits_given += (*num_bits);
+ *suballoc_loc = res.sr_bg_blkno;
+ *suballoc_bit_start = res.sr_bit_offset;
+ *blkno_start = res.sr_blkno;
+ ac->ac_bits_given += res.sr_bits;
+ *num_bits = res.sr_bits;
status = 0;
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
static void ocfs2_init_inode_ac_group(struct inode *dir,
- struct buffer_head *parent_fe_bh,
+ struct buffer_head *parent_di_bh,
struct ocfs2_alloc_context *ac)
{
- struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
/*
* Try to allocate inodes from some specific group.
*
@@ -1669,10 +2046,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir,
if (OCFS2_I(dir)->ip_last_used_group &&
OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
- else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
- ac->ac_last_group = ocfs2_which_suballoc_group(
- le64_to_cpu(fe->i_blkno),
- le16_to_cpu(fe->i_suballoc_bit));
+ else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
+ if (di->i_suballoc_loc)
+ ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
+ else
+ ac->ac_last_group = ocfs2_which_suballoc_group(
+ le64_to_cpu(di->i_blkno),
+ le16_to_cpu(di->i_suballoc_bit));
+ }
}
static inline void ocfs2_save_inode_ac_group(struct inode *dir,
@@ -1682,19 +2063,148 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
}
-int ocfs2_claim_new_inode(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_find_new_inode_loc(struct inode *dir,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_alloc_context *ac,
+ u64 *fe_blkno)
+{
+ int ret;
+ handle_t *handle = NULL;
+ struct ocfs2_suballoc_result *res;
+
+ BUG_ON(!ac);
+ BUG_ON(ac->ac_bits_given != 0);
+ BUG_ON(ac->ac_bits_wanted != 1);
+ BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
+
+ res = kzalloc(sizeof(*res), GFP_NOFS);
+ if (res == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
+
+ /*
+ * The handle started here is for chain relink. Alternatively,
+ * we could just disable relink for these calls.
+ */
+ handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * This will instruct ocfs2_claim_suballoc_bits and
+ * ocfs2_search_one_group to search but save actual allocation
+ * for later.
+ */
+ ac->ac_find_loc_only = 1;
+
+ ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ac->ac_find_loc_priv = res;
+ *fe_blkno = res->sr_blkno;
+ ocfs2_update_inode_fsync_trans(handle, dir, 0);
+out:
+ if (handle)
+ ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
+
+ if (ret)
+ kfree(res);
+
+ return ret;
+}
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
+ u16 *suballoc_bit,
+ u64 di_blkno)
+{
+ int ret;
+ u16 chain;
+ struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
+ struct buffer_head *bg_bh = NULL;
+ struct ocfs2_group_desc *bg;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+
+ /*
+ * Since di_blkno is being passed back in, we check for any
+ * inconsistencies which may have happened between
+ * calls. These are code bugs as di_blkno is not expected to
+ * change once returned from ocfs2_find_new_inode_loc()
+ */
+ BUG_ON(res->sr_blkno != di_blkno);
+
+ ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
+ res->sr_bg_stable_blkno, &bg_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+ chain = le16_to_cpu(bg->bg_chain);
+
+ ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
+ ac->ac_bh, res->sr_bits,
+ chain);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_block_group_set_bits(handle,
+ ac->ac_inode,
+ bg,
+ bg_bh,
+ res->sr_bit_offset,
+ res->sr_bits);
+ if (ret < 0) {
+ ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
+ ac->ac_bh, res->sr_bits, chain);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
+ res->sr_bits);
+
+ atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
+
+ BUG_ON(res->sr_bits != 1);
+
+ *suballoc_loc = res->sr_bg_blkno;
+ *suballoc_bit = res->sr_bit_offset;
+ ac->ac_bits_given++;
+ ocfs2_save_inode_ac_group(dir, ac);
+
+out:
+ brelse(bg_bh);
+
+ return ret;
+}
+
+int ocfs2_claim_new_inode(handle_t *handle,
struct inode *dir,
struct buffer_head *parent_fe_bh,
struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
u16 *suballoc_bit,
u64 *fe_blkno)
{
int status;
- unsigned int num_bits;
- u64 bg_blkno;
-
- mlog_entry_void();
+ struct ocfs2_suballoc_result res;
BUG_ON(!ac);
BUG_ON(ac->ac_bits_given != 0);
@@ -1703,28 +2213,28 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
- status = ocfs2_claim_suballoc_bits(osb,
- ac,
+ status = ocfs2_claim_suballoc_bits(ac,
handle,
1,
1,
- suballoc_bit,
- &num_bits,
- &bg_blkno);
+ &res);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- atomic_inc(&osb->alloc_stats.bg_allocs);
+ atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
- BUG_ON(num_bits != 1);
+ BUG_ON(res.sr_bits != 1);
- *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
+ *suballoc_loc = res.sr_bg_blkno;
+ *suballoc_bit = res.sr_bit_offset;
+ *fe_blkno = res.sr_blkno;
ac->ac_bits_given++;
ocfs2_save_inode_ac_group(dir, ac);
status = 0;
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1789,8 +2299,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
* contig. allocation, set to '1' to indicate we can deal with extents
* of any size.
*/
-int __ocfs2_claim_clusters(struct ocfs2_super *osb,
- handle_t *handle,
+int __ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 max_clusters,
@@ -1799,10 +2308,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
{
int status;
unsigned int bits_wanted = max_clusters;
- u64 bg_blkno = 0;
- u16 bg_bit_off;
-
- mlog_entry_void();
+ struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
+ struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
@@ -1810,6 +2317,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
&& ac->ac_which != OCFS2_AC_USE_MAIN);
if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
+ WARN_ON(min_clusters > 1);
+
status = ocfs2_claim_local_alloc_bits(osb,
handle,
ac,
@@ -1832,20 +2341,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
if (bits_wanted > (osb->bitmap_cpg - 1))
bits_wanted = osb->bitmap_cpg - 1;
- status = ocfs2_claim_suballoc_bits(osb,
- ac,
+ status = ocfs2_claim_suballoc_bits(ac,
handle,
bits_wanted,
min_clusters,
- &bg_bit_off,
- num_clusters,
- &bg_blkno);
+ &res);
if (!status) {
+ BUG_ON(res.sr_blkno); /* cluster alloc can't set sr_blkno */
*cluster_start =
ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
- bg_blkno,
- bg_bit_off);
+ res.sr_bg_blkno,
+ res.sr_bit_offset);
atomic_inc(&osb->alloc_stats.bitmap_data);
+ *num_clusters = res.sr_bits;
}
}
if (status < 0) {
@@ -1857,12 +2365,12 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
ac->ac_bits_given += *num_clusters;
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 *cluster_start,
@@ -1870,56 +2378,69 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
{
unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
- return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
+ return __ocfs2_claim_clusters(handle, ac, min_clusters,
bits_wanted, cluster_start, num_clusters);
}
-static inline int ocfs2_block_group_clear_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits)
+static int ocfs2_block_group_clear_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_group_desc *bg,
+ struct buffer_head *group_bh,
+ unsigned int bit_off,
+ unsigned int num_bits,
+ void (*undo_fn)(unsigned int bit,
+ unsigned long *bmap))
{
int status;
unsigned int tmp;
- int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
struct ocfs2_group_desc *undo_bg = NULL;
- mlog_entry_void();
-
/* The caller got this descriptor from
* ocfs2_read_group_descriptor(). Any corruption is a code bug. */
BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
- mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
-
- if (ocfs2_is_cluster_bitmap(alloc_inode))
- journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+ trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
- status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
- journal_type);
+ BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
+ status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+ group_bh,
+ undo_fn ?
+ OCFS2_JOURNAL_ACCESS_UNDO :
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- if (ocfs2_is_cluster_bitmap(alloc_inode))
- undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
+ if (undo_fn) {
+ jbd_lock_bh_state(group_bh);
+ undo_bg = (struct ocfs2_group_desc *)
+ bh2jh(group_bh)->b_committed_data;
+ BUG_ON(!undo_bg);
+ }
tmp = num_bits;
while(tmp--) {
ocfs2_clear_bit((bit_off + tmp),
(unsigned long *) bg->bg_bitmap);
- if (ocfs2_is_cluster_bitmap(alloc_inode))
- ocfs2_set_bit(bit_off + tmp,
- (unsigned long *) undo_bg->bg_bitmap);
+ if (undo_fn)
+ undo_fn(bit_off + tmp,
+ (unsigned long *) undo_bg->bg_bitmap);
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+ /* Unlock first: the error return below must not leak the state lock. */
+ if (undo_fn)
+ jbd_unlock_bh_state(group_bh);
+ if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+ ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+ " count %u but claims %u are freed. num_bits %u",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count), num_bits);
+ return -EROFS;
+ }
- status = ocfs2_journal_dirty(handle, group_bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, group_bh);
bail:
return status;
}
@@ -1927,12 +2448,14 @@ bail:
/*
* expects the suballoc inode to already be locked.
*/
-int ocfs2_free_suballoc_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct buffer_head *alloc_bh,
- unsigned int start_bit,
- u64 bg_blkno,
- unsigned int count)
+static int _ocfs2_free_suballoc_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *alloc_bh,
+ unsigned int start_bit,
+ u64 bg_blkno,
+ unsigned int count,
+ void (*undo_fn)(unsigned int bit,
+ unsigned long *bitmap))
{
int status = 0;
u32 tmp_used;
@@ -1941,19 +2464,18 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
struct buffer_head *group_bh = NULL;
struct ocfs2_group_desc *group;
- mlog_entry_void();
-
/* The alloc_bh comes from ocfs2_free_dinode() or
* ocfs2_free_clusters(). The callers have all locked the
* allocator and gotten alloc_bh from the lock call. This
- * validates the dinode buffer. Any corruption that has happended
+ * validates the dinode buffer. Any corruption that has happened
* is a code bug. */
BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
- mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
- (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
- (unsigned long long)bg_blkno, start_bit);
+ trace_ocfs2_free_suballoc_bits(
+ (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
+ (unsigned long long)bg_blkno,
+ start_bit, count);
status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
&group_bh);
@@ -1967,14 +2489,14 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
status = ocfs2_block_group_clear_bits(handle, alloc_inode,
group, group_bh,
- start_bit, count);
+ start_bit, count, undo_fn);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
+ alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1984,20 +2506,27 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
count);
tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
-
- status = ocfs2_journal_dirty(handle, alloc_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, alloc_bh);
bail:
brelse(group_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
+int ocfs2_free_suballoc_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *alloc_bh,
+ unsigned int start_bit,
+ u64 bg_blkno,
+ unsigned int count)
+{
+ return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
+ start_bit, bg_blkno, count, NULL);
+}
+
int ocfs2_free_dinode(handle_t *handle,
struct inode *inode_alloc_inode,
struct buffer_head *inode_alloc_bh,
@@ -2007,15 +2536,19 @@ int ocfs2_free_dinode(handle_t *handle,
u16 bit = le16_to_cpu(di->i_suballoc_bit);
u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+ if (di->i_suballoc_loc)
+ bg_blkno = le64_to_cpu(di->i_suballoc_loc);
return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
inode_alloc_bh, bit, bg_blkno, 1);
}
-int ocfs2_free_clusters(handle_t *handle,
- struct inode *bitmap_inode,
- struct buffer_head *bitmap_bh,
- u64 start_blk,
- unsigned int num_clusters)
+static int _ocfs2_free_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters,
+ void (*undo_fn)(unsigned int bit,
+ unsigned long *bitmap))
{
int status;
u16 bg_start_bit;
@@ -2024,11 +2557,8 @@ int ocfs2_free_clusters(handle_t *handle,
/* You can't ever have a contiguous set of clusters
* bigger than a block group bitmap so we never have to worry
- * about looping on them. */
-
- mlog_entry_void();
-
- /* This is expensive. We can safely remove once this stuff has
+ * about looping on them.
+ * This is expensive. We can safely remove once this stuff has
* gotten tested really well. */
BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
@@ -2037,14 +2567,13 @@ int ocfs2_free_clusters(handle_t *handle,
ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
&bg_start_bit);
- mlog(0, "want to free %u clusters starting at block %llu\n",
- num_clusters, (unsigned long long)start_blk);
- mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
- (unsigned long long)bg_blkno, bg_start_bit);
+ trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
+ (unsigned long long)start_blk,
+ bg_start_bit, num_clusters);
- status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
- bg_start_bit, bg_blkno,
- num_clusters);
+ status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
+ bg_start_bit, bg_blkno,
+ num_clusters, undo_fn);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -2054,10 +2583,37 @@ int ocfs2_free_clusters(handle_t *handle,
num_clusters);
out:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
+int ocfs2_free_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters)
+{
+ return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+ start_blk, num_clusters,
+ _ocfs2_set_bit);
+}
+
+/*
+ * Give never-used clusters back to the global bitmap. We don't need
+ * to protect these bits in the undo buffer.
+ */
+int ocfs2_release_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters)
+{
+ return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+ start_blk, num_clusters,
+ _ocfs2_clear_bit);
+}
+
static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
{
printk("Block Group:\n");
@@ -2131,7 +2687,7 @@ int ocfs2_lock_allocators(struct inode *inode,
BUG_ON(clusters_to_add != 0 && data_ac == NULL);
- num_free_extents = ocfs2_num_free_extents(osb, inode, et);
+ num_free_extents = ocfs2_num_free_extents(osb, et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
@@ -2191,13 +2747,14 @@ out:
* suballoc_bit.
*/
static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
- u16 *suballoc_slot, u16 *suballoc_bit)
+ u16 *suballoc_slot, u64 *group_blkno,
+ u16 *suballoc_bit)
{
int status;
struct buffer_head *inode_bh = NULL;
struct ocfs2_dinode *inode_fe;
- mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
+ trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
/* dirty read disk */
status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
@@ -2228,11 +2785,14 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
if (suballoc_bit)
*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
+ if (group_blkno)
+ *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
bail:
brelse(inode_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -2245,29 +2805,31 @@ bail:
*/
static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
struct inode *suballoc,
- struct buffer_head *alloc_bh, u64 blkno,
+ struct buffer_head *alloc_bh,
+ u64 group_blkno, u64 blkno,
u16 bit, int *res)
{
- struct ocfs2_dinode *alloc_fe;
+ struct ocfs2_dinode *alloc_di;
struct ocfs2_group_desc *group;
struct buffer_head *group_bh = NULL;
u64 bg_blkno;
int status;
- mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
- (unsigned int)bit);
+ trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
+ (unsigned int)bit);
- alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
- if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
+ alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
+ if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
(unsigned int)bit,
- ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
+ ocfs2_bits_per_group(&alloc_di->id2.i_chain));
status = -EINVAL;
goto bail;
}
- bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
- status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
+ bg_blkno = group_blkno ? group_blkno :
+ ocfs2_which_suballoc_group(blkno, bit);
+ status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
&group_bh);
if (status < 0) {
mlog(ML_ERROR, "read group %llu failed %d\n",
@@ -2281,7 +2843,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
bail:
brelse(group_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -2301,14 +2864,15 @@ bail:
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
{
int status;
+ u64 group_blkno = 0;
u16 suballoc_bit = 0, suballoc_slot = 0;
struct inode *inode_alloc_inode;
struct buffer_head *alloc_bh = NULL;
- mlog_entry("blkno: %llu", (unsigned long long)blkno);
+ trace_ocfs2_test_inode_bit((unsigned long long)blkno);
status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
- &suballoc_bit);
+ &group_blkno, &suballoc_bit);
if (status < 0) {
mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
goto bail;
@@ -2330,13 +2894,14 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
if (status < 0) {
mutex_unlock(&inode_alloc_inode->i_mutex);
+ iput(inode_alloc_inode);
mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
(u32)suballoc_slot, status);
goto bail;
}
status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
- blkno, suballoc_bit, res);
+ group_blkno, blkno, suballoc_bit, res);
if (status < 0)
mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
@@ -2346,6 +2911,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
iput(inode_alloc_inode);
brelse(alloc_bh);
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8c9a78a4316..2d2501767c0 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -26,13 +26,14 @@
#ifndef _CHAINALLOC_H_
#define _CHAINALLOC_H_
+struct ocfs2_suballoc_result;
typedef int (group_search_t)(struct inode *,
struct buffer_head *,
u32, /* bits_wanted */
u32, /* min_bits */
u64, /* max_block */
- u16 *, /* *bit_off */
- u16 *); /* *bits_found */
+ struct ocfs2_suballoc_result *);
+ /* found bits */
struct ocfs2_alloc_context {
struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -48,14 +49,20 @@ struct ocfs2_alloc_context {
/* these are used by the chain search */
u16 ac_chain;
- int ac_allow_chain_relink;
+ int ac_disable_chain_relink;
group_search_t *ac_group_search;
u64 ac_last_group;
u64 ac_max_block; /* Highest block number to allocate. 0 is
is the same as ~0 - unlimited */
+
+ int ac_find_loc_only; /* hack for reflink operation ordering */
+ struct ocfs2_suballoc_result *ac_find_loc_priv; /* set by ocfs2_find_new_inode_loc() */
+
+ struct ocfs2_alloc_reservation *ac_resv;
};
+void ocfs2_init_steal_slots(struct ocfs2_super *osb);
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
{
@@ -79,22 +86,37 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
u32 bits_wanted,
struct ocfs2_alloc_context **ac);
-int ocfs2_claim_metadata(struct ocfs2_super *osb,
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
handle_t *handle,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain);
+void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain);
+int ocfs2_block_group_set_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_group_desc *bg,
+ struct buffer_head *group_bh,
+ unsigned int bit_off,
+ unsigned int num_bits);
+
+int ocfs2_claim_metadata(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
+ u64 *suballoc_loc,
u16 *suballoc_bit_start,
u32 *num_bits,
u64 *blkno_start);
-int ocfs2_claim_new_inode(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_new_inode(handle_t *handle,
struct inode *dir,
struct buffer_head *parent_fe_bh,
struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
u16 *suballoc_bit,
u64 *fe_blkno);
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 *cluster_start,
@@ -103,8 +125,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
* Use this variant of ocfs2_claim_clusters to specify a maxiumum
* number of clusters smaller than the allocation reserved.
*/
-int __ocfs2_claim_clusters(struct ocfs2_super *osb,
- handle_t *handle,
+int __ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 max_clusters,
@@ -126,6 +147,11 @@ int ocfs2_free_clusters(handle_t *handle,
struct buffer_head *bitmap_bh,
u64 start_blk,
unsigned int num_clusters);
+int ocfs2_release_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters);
static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
{
@@ -190,4 +216,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
struct ocfs2_alloc_context **meta_ac);
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
+
+
+
+/*
+ * The following two interfaces are for ocfs2_create_inode_in_orphan().
+ */
+int ocfs2_find_new_inode_loc(struct inode *dir,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_alloc_context *ac,
+ u64 *fe_blkno);
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
+ u16 *suballoc_bit,
+ u64 di_blkno);
+
#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5c6163f5503..ddb662b3244 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -28,7 +28,6 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
-#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/statfs.h>
@@ -42,8 +41,11 @@
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/quotaops.h>
+#include <linux/cleancache.h>
+
+#define CREATE_TRACE_POINTS
+#include "ocfs2_trace.h"
-#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -52,6 +54,7 @@
#include "ocfs1_fs_compat.h"
#include "alloc.h"
+#include "aops.h"
#include "blockcheck.h"
#include "dlmglue.h"
#include "export.h"
@@ -65,26 +68,28 @@
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
-#include "ver.h"
#include "xattr.h"
#include "quota.h"
+#include "refcounttree.h"
+#include "suballoc.h"
#include "buffer_head_io.h"
-static struct kmem_cache *ocfs2_inode_cachep = NULL;
+static struct kmem_cache *ocfs2_inode_cachep;
struct kmem_cache *ocfs2_dquot_cachep;
struct kmem_cache *ocfs2_qf_chunk_cachep;
-/* OCFS2 needs to schedule several differnt types of work which
+/* OCFS2 needs to schedule several different types of work which
* require cluster locking, disk I/O, recovery waits, etc. Since these
* types of work tend to be heavy we avoid using the kernel events
* workqueue and schedule on our own. */
struct workqueue_struct *ocfs2_wq = NULL;
-static struct dentry *ocfs2_debugfs_root = NULL;
+static struct dentry *ocfs2_debugfs_root;
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster file system");
struct mount_options
{
@@ -92,14 +97,18 @@ struct mount_options
unsigned long mount_opt;
unsigned int atime_quantum;
signed short slot;
- unsigned int localalloc_opt;
+ int localalloc_opt;
+ unsigned int resv_level;
+ int dir_resv_level;
char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
};
static int ocfs2_parse_options(struct super_block *sb, char *options,
struct mount_options *mopt,
int is_remount);
-static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt);
+static int ocfs2_check_set_options(struct super_block *sb,
+ struct mount_options *options);
+static int ocfs2_show_options(struct seq_file *s, struct dentry *root);
static void ocfs2_put_super(struct super_block *sb);
static int ocfs2_mount_volume(struct super_block *sb);
static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -118,15 +127,16 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
static int ocfs2_check_volume(struct ocfs2_super *osb);
static int ocfs2_verify_volume(struct ocfs2_dinode *di,
struct buffer_head *bh,
- u32 sectsize);
+ u32 sectsize,
+ struct ocfs2_blockcheck_stats *stats);
static int ocfs2_initialize_super(struct super_block *sb,
struct buffer_head *bh,
- int sector_size);
+ int sector_size,
+ struct ocfs2_blockcheck_stats *stats);
static int ocfs2_get_sector(struct super_block *sb,
struct buffer_head **bh,
int block,
int sect_size);
-static void ocfs2_write_super(struct super_block *sb);
static struct inode *ocfs2_alloc_inode(struct super_block *sb);
static void ocfs2_destroy_inode(struct inode *inode);
static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
@@ -138,10 +148,8 @@ static const struct super_operations ocfs2_sops = {
.alloc_inode = ocfs2_alloc_inode,
.destroy_inode = ocfs2_destroy_inode,
.drop_inode = ocfs2_drop_inode,
- .clear_inode = ocfs2_clear_inode,
- .delete_inode = ocfs2_delete_inode,
+ .evict_inode = ocfs2_evict_inode,
.sync_fs = ocfs2_sync_fs,
- .write_super = ocfs2_write_super,
.put_super = ocfs2_put_super,
.remount_fs = ocfs2_remount,
.show_options = ocfs2_show_options,
@@ -157,6 +165,7 @@ enum {
Opt_nointr,
Opt_hb_none,
Opt_hb_local,
+ Opt_hb_global,
Opt_data_ordered,
Opt_data_writeback,
Opt_atime_quantum,
@@ -172,6 +181,10 @@ enum {
Opt_noacl,
Opt_usrquota,
Opt_grpquota,
+ Opt_coherency_buffered,
+ Opt_coherency_full,
+ Opt_resv_level,
+ Opt_dir_resv_level,
Opt_err,
};
@@ -183,6 +196,7 @@ static const match_table_t tokens = {
{Opt_nointr, "nointr"},
{Opt_hb_none, OCFS2_HB_NONE},
{Opt_hb_local, OCFS2_HB_LOCAL},
+ {Opt_hb_global, OCFS2_HB_GLOBAL},
{Opt_data_ordered, "data=ordered"},
{Opt_data_writeback, "data=writeback"},
{Opt_atime_quantum, "atime_quantum=%u"},
@@ -198,16 +212,20 @@ static const match_table_t tokens = {
{Opt_noacl, "noacl"},
{Opt_usrquota, "usrquota"},
{Opt_grpquota, "grpquota"},
+ {Opt_coherency_buffered, "coherency=buffered"},
+ {Opt_coherency_full, "coherency=full"},
+ {Opt_resv_level, "resv_level=%u"},
+ {Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_err, NULL}
};
#ifdef CONFIG_DEBUG_FS
static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
{
- int out = 0;
- int i;
struct ocfs2_cluster_connection *cconn = osb->cconn;
struct ocfs2_recovery_map *rm = osb->recovery_map;
+ struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
+ int i, out = 0;
out += snprintf(buf + out, len - out,
"%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
@@ -232,20 +250,24 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
"%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount",
osb->s_mount_opt, osb->s_atime_quantum);
- out += snprintf(buf + out, len - out,
- "%10s => Stack: %s Name: %*s Version: %d.%d\n",
- "Cluster",
- (*osb->osb_cluster_stack == '\0' ?
- "o2cb" : osb->osb_cluster_stack),
- cconn->cc_namelen, cconn->cc_name,
- cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
+ if (cconn) {
+ out += snprintf(buf + out, len - out,
+ "%10s => Stack: %s Name: %*s "
+ "Version: %d.%d\n", "Cluster",
+ (*osb->osb_cluster_stack == '\0' ?
+ "o2cb" : osb->osb_cluster_stack),
+ cconn->cc_namelen, cconn->cc_name,
+ cconn->cc_version.pv_major,
+ cconn->cc_version.pv_minor);
+ }
spin_lock(&osb->dc_task_lock);
out += snprintf(buf + out, len - out,
"%10s => Pid: %d Count: %lu WakeSeq: %lu "
"WorkSeq: %lu\n", "DownCnvt",
- task_pid_nr(osb->dc_task), osb->blocked_lock_count,
- osb->dc_wake_sequence, osb->dc_work_sequence);
+ (osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
+ osb->blocked_lock_count, osb->dc_wake_sequence,
+ osb->dc_work_sequence);
spin_unlock(&osb->dc_task_lock);
spin_lock(&osb->osb_lock);
@@ -264,15 +286,15 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
spin_unlock(&osb->osb_lock);
out += snprintf(buf + out, len - out,
- "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
- task_pid_nr(osb->commit_task), osb->osb_commit_interval,
- atomic_read(&osb->needs_checkpoint));
+ "%10s => Pid: %d Interval: %lu\n", "Commit",
+ (osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
+ osb->osb_commit_interval);
out += snprintf(buf + out, len - out,
- "%10s => State: %d NumTxns: %d TxnId: %lu\n",
+ "%10s => State: %d TxnId: %lu NumTxns: %d\n",
"Journal", osb->journal->j_state,
- atomic_read(&osb->journal->j_num_trans),
- osb->journal->j_trans_id);
+ osb->journal->j_trans_id,
+ atomic_read(&osb->journal->j_num_trans));
out += snprintf(buf + out, len - out,
"%10s => GlobalAllocs: %d LocalAllocs: %d "
@@ -293,14 +315,26 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
spin_lock(&osb->osb_lock);
out += snprintf(buf + out, len - out,
- "%10s => Slot: %d NumStolen: %d\n", "Steal",
+ "%10s => InodeSlot: %d StolenInodes: %d, "
+ "MetaSlot: %d StolenMeta: %d\n", "Steal",
osb->s_inode_steal_slot,
- atomic_read(&osb->s_num_inodes_stolen));
+ atomic_read(&osb->s_num_inodes_stolen),
+ osb->s_meta_steal_slot,
+ atomic_read(&osb->s_num_meta_stolen));
spin_unlock(&osb->osb_lock);
+ out += snprintf(buf + out, len - out, "OrphanScan => ");
+ out += snprintf(buf + out, len - out, "Local: %u Global: %u ",
+ os->os_count, os->os_seqno);
+ out += snprintf(buf + out, len - out, " Last Scan: ");
+ if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
+ out += snprintf(buf + out, len - out, "Disabled\n");
+ else
+ out += snprintf(buf + out, len - out, "%lu seconds ago\n",
+ (get_seconds() - os->os_scantime.tv_sec));
+
out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
"Slots", "Num", "RecoGen");
-
for (i = 0; i < osb->max_slots; ++i) {
out += snprintf(buf + out, len - out,
"%10s %c %3d %10d\n",
@@ -358,31 +392,19 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
}
#endif /* CONFIG_DEBUG_FS */
-static struct file_operations ocfs2_osb_debug_fops = {
+static const struct file_operations ocfs2_osb_debug_fops = {
.open = ocfs2_osb_debug_open,
.release = ocfs2_debug_release,
.read = ocfs2_debug_read,
.llseek = generic_file_llseek,
};
-/*
- * write_super and sync_fs ripped right out of ext3.
- */
-static void ocfs2_write_super(struct super_block *sb)
-{
- if (mutex_trylock(&sb->s_lock) != 0)
- BUG();
- sb->s_dirt = 0;
-}
-
static int ocfs2_sync_fs(struct super_block *sb, int wait)
{
int status;
tid_t target;
struct ocfs2_super *osb = OCFS2_SB(sb);
- sb->s_dirt = 0;
-
if (ocfs2_is_hard_readonly(osb))
return -EROFS;
@@ -422,8 +444,6 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
int status = 0;
int i;
- mlog_entry_void();
-
new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
if (IS_ERR(new)) {
status = PTR_ERR(new);
@@ -459,7 +479,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
}
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -469,8 +490,6 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
int status = 0;
int i;
- mlog_entry_void();
-
for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
i < NUM_SYSTEM_INODES;
i++) {
@@ -489,7 +508,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
}
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -498,13 +518,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
int i;
struct inode *inode;
- mlog_entry_void();
-
- for (i = 0; i < NUM_SYSTEM_INODES; i++) {
- inode = osb->system_inodes[i];
+ for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
+ inode = osb->global_system_inodes[i];
if (inode) {
iput(inode);
- osb->system_inodes[i] = NULL;
+ osb->global_system_inodes[i] = NULL;
}
}
@@ -520,7 +538,18 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
osb->root_inode = NULL;
}
- mlog_exit(0);
+ if (!osb->local_system_inodes)
+ return;
+
+ for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
+ if (osb->local_system_inodes[i]) {
+ iput(osb->local_system_inodes[i]);
+ osb->local_system_inodes[i] = NULL;
+ }
+ }
+
+ kfree(osb->local_system_inodes);
+ osb->local_system_inodes = NULL;
}
/* We're allocating fs objects, use GFP_NOFS */
@@ -532,15 +561,24 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
if (!oi)
return NULL;
+ oi->i_sync_tid = 0;
+ oi->i_datasync_tid = 0;
+
jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
return &oi->vfs_inode;
}
-static void ocfs2_destroy_inode(struct inode *inode)
+static void ocfs2_i_callback(struct rcu_head *head)
{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
}
+static void ocfs2_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, ocfs2_i_callback);
+}
+
static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
unsigned int cbits)
{
@@ -555,7 +593,7 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
*/
#if BITS_PER_LONG == 32
-# if defined(CONFIG_LBD)
+# if defined(CONFIG_LBDAF)
BUILD_BUG_ON(sizeof(sector_t) != 8);
/*
* We might be limited by page cache size.
@@ -594,14 +632,19 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
int ret = 0;
struct mount_options parsed_options;
struct ocfs2_super *osb = OCFS2_SB(sb);
+ u32 tmp;
+
+ sync_filesystem(sb);
- if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
+ if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
+ !ocfs2_check_set_options(sb, &parsed_options)) {
ret = -EINVAL;
goto out;
}
- if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
- (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+ tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
+ OCFS2_MOUNT_HB_NONE;
+ if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
ret = -EINVAL;
mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
goto out;
@@ -641,12 +684,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
}
if (*flags & MS_RDONLY) {
- mlog(0, "Going to ro mode.\n");
sb->s_flags |= MS_RDONLY;
osb->osb_flags |= OCFS2_OSB_SOFT_RO;
} else {
- mlog(0, "Making ro filesystem writeable.\n");
-
if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
mlog(ML_ERROR, "Cannot remount RDWR "
"filesystem due to previous errors.\n");
@@ -664,6 +704,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
sb->s_flags &= ~MS_RDONLY;
osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
}
+ trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
unlock_osb:
spin_unlock(&osb->osb_lock);
/* Enable quota accounting after remounting RW */
@@ -686,8 +727,6 @@ unlock_osb:
if (!ret) {
/* Only save off the new mount options in case of a successful
* remount. */
- if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
- parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
@@ -696,6 +735,10 @@ unlock_osb:
if (!ocfs2_is_hard_readonly(osb))
ocfs2_set_journal_params(osb);
+
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
+ MS_POSIXACL : 0);
}
out:
return ret;
@@ -703,7 +746,8 @@ out:
static int ocfs2_sb_probe(struct super_block *sb,
struct buffer_head **bh,
- int *sector_size)
+ int *sector_size,
+ struct ocfs2_blockcheck_stats *stats)
{
int status, tmpstat;
struct ocfs1_vol_disk_hdr *hdr;
@@ -766,16 +810,20 @@ static int ocfs2_sb_probe(struct super_block *sb,
if (tmpstat < 0) {
status = tmpstat;
mlog_errno(status);
- goto bail;
+ break;
}
di = (struct ocfs2_dinode *) (*bh)->b_data;
- status = ocfs2_verify_volume(di, *bh, blksize);
- if (status >= 0)
- goto bail;
- brelse(*bh);
- *bh = NULL;
- if (status != -EAGAIN)
+ memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
+ spin_lock_init(&stats->b_lock);
+ tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats);
+ if (tmpstat < 0) {
+ brelse(*bh);
+ *bh = NULL;
+ }
+ if (tmpstat != -EAGAIN) {
+ status = tmpstat;
break;
+ }
}
bail:
@@ -784,23 +832,29 @@ bail:
static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
{
- if (ocfs2_mount_local(osb)) {
- if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+ u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
+
+ if (osb->s_mount_opt & hb_enabled) {
+ if (ocfs2_mount_local(osb)) {
mlog(ML_ERROR, "Cannot heartbeat on a locally "
"mounted device.\n");
return -EINVAL;
}
- }
-
- if (ocfs2_userspace_stack(osb)) {
- if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+ if (ocfs2_userspace_stack(osb)) {
mlog(ML_ERROR, "Userspace stack expected, but "
"o2cb heartbeat arguments passed to mount\n");
return -EINVAL;
}
+ if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
+ !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
+ ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
+ ocfs2_cluster_o2cb_global_heartbeat(osb))) {
+ mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
+ return -EINVAL;
+ }
}
- if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+ if (!(osb->s_mount_opt & hb_enabled)) {
if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
!ocfs2_userspace_stack(osb)) {
mlog(ML_ERROR, "Heartbeat has to be started to mount "
@@ -853,13 +907,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
continue;
if (unsuspend)
- status = vfs_quota_enable(
- sb_dqopt(sb)->files[type],
- type, QFMT_OCFS2,
- DQUOT_SUSPENDED);
- else
- status = vfs_quota_disable(sb, type,
- DQUOT_SUSPENDED);
+ status = dquot_resume(sb, type);
+ else {
+ struct ocfs2_mem_dqinfo *oinfo;
+
+ /* Cancel periodic syncing before suspending */
+ oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+ status = dquot_suspend(sb, type);
+ }
if (status < 0)
break;
}
@@ -890,8 +946,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
status = -ENOENT;
goto out_quota_off;
}
- status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
- DQUOT_USAGE_ENABLED);
+ status = dquot_enable(inode[type], type, QFMT_OCFS2,
+ DQUOT_USAGE_ENABLED);
if (status < 0)
goto out_quota_off;
}
@@ -912,18 +968,22 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
int type;
struct inode *inode;
struct super_block *sb = osb->sb;
+ struct ocfs2_mem_dqinfo *oinfo;
/* We mostly ignore errors in this function because there's not much
* we can do when we see them */
for (type = 0; type < MAXQUOTAS; type++) {
if (!sb_has_quota_loaded(sb, type))
continue;
+ /* Cancel periodic syncing before we grab dqonoff_mutex */
+ oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
inode = igrab(sb->s_dquot.files[type]);
/* Turn off quotas. This will remove all dquot structures from
* memory and so they will be automatically synced to global
* quota files */
- vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
- DQUOT_LIMITS_ENABLED);
+ dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED);
if (!inode)
continue;
iput(inode);
@@ -931,8 +991,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
}
/* Handle quota on quotactl */
-static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
- char *path, int remount)
+static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
{
unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -940,30 +999,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
return -EINVAL;
- if (remount)
- return 0; /* Just ignore it has been handled in
- * ocfs2_remount() */
- return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
- format_id, DQUOT_LIMITS_ENABLED);
+ return dquot_enable(sb_dqopt(sb)->files[type], type,
+ format_id, DQUOT_LIMITS_ENABLED);
}
/* Handle quota off quotactl */
-static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+static int ocfs2_quota_off(struct super_block *sb, int type)
{
- if (remount)
- return 0; /* Ignore now and handle later in
- * ocfs2_remount() */
- return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
+ return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
}
-static struct quotactl_ops ocfs2_quotactl_ops = {
- .quota_on = ocfs2_quota_on,
+static const struct quotactl_ops ocfs2_quotactl_ops = {
+ .quota_on_meta = ocfs2_quota_on,
.quota_off = ocfs2_quota_off,
- .quota_sync = vfs_quota_sync,
- .get_info = vfs_get_dqinfo,
- .set_info = vfs_set_dqinfo,
- .get_dqblk = vfs_get_dqblk,
- .set_dqblk = vfs_set_dqblk,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk,
};
static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
@@ -974,9 +1027,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
struct inode *inode = NULL;
struct ocfs2_super *osb = NULL;
struct buffer_head *bh = NULL;
- char nodestr[8];
+ char nodestr[12];
+ struct ocfs2_blockcheck_stats stats;
- mlog_entry("%p, %p, %i", sb, data, silent);
+ trace_ocfs2_fill_super(sb, data, silent);
if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
status = -EINVAL;
@@ -984,13 +1038,13 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
}
/* probe for superblock */
- status = ocfs2_sb_probe(sb, &bh, &sector_size);
+ status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats);
if (status < 0) {
mlog(ML_ERROR, "superblock probe failed!\n");
goto read_super_error;
}
- status = ocfs2_initialize_super(sb, bh, sector_size);
+ status = ocfs2_initialize_super(sb, bh, sector_size, &stats);
osb = OCFS2_SB(sb);
if (status < 0) {
mlog_errno(status);
@@ -999,31 +1053,22 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
bh = NULL;
- if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
- parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
-
+ if (!ocfs2_check_set_options(sb, &parsed_options)) {
+ status = -EINVAL;
+ goto read_super_error;
+ }
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
osb->osb_commit_interval = parsed_options.commit_interval;
- osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
- osb->local_alloc_bits = osb->local_alloc_default_bits;
- if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
- !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
- OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
- status = -EINVAL;
- mlog(ML_ERROR, "User quotas were requested, but this "
- "filesystem does not have the feature enabled.\n");
- goto read_super_error;
- }
- if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
- !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
- status = -EINVAL;
- mlog(ML_ERROR, "Group quotas were requested, but this "
- "filesystem does not have the feature enabled.\n");
- goto read_super_error;
- }
+
+ ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
+ osb->osb_resv_level = parsed_options.resv_level;
+ osb->osb_dir_resv_level = parsed_options.resv_level;
+ if (parsed_options.dir_resv_level == -1)
+ osb->osb_dir_resv_level = parsed_options.resv_level;
+ else
+ osb->osb_dir_resv_level = parsed_options.dir_resv_level;
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
if (status)
@@ -1031,7 +1076,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = OCFS2_SUPER_MAGIC;
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
@@ -1060,15 +1105,15 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
"file system, but write access is "
"unavailable.\n");
else
- mlog_errno(status);
+ mlog_errno(status);
goto read_super_error;
}
ocfs2_set_ro_flag(osb, 1);
- printk(KERN_NOTICE "Readonly device detected. No cluster "
- "services will be utilized for this mount. Recovery "
- "will be skipped.\n");
+ printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
+ "Cluster services will not be used for this mount. "
+ "Recovery will be skipped.\n", osb->dev_str);
}
if (!ocfs2_is_hard_readonly(osb)) {
@@ -1100,20 +1145,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
goto read_super_error;
}
- status = ocfs2_mount_volume(sb);
- if (osb->root_inode)
- inode = igrab(osb->root_inode);
+ if (ocfs2_meta_ecc(osb)) {
+ status = ocfs2_blockcheck_stats_debugfs_install(
+ &osb->osb_ecc_stats,
+ osb->osb_debug_root);
+ if (status) {
+ mlog(ML_ERROR,
+ "Unable to create blockcheck statistics "
+ "files\n");
+ goto read_super_error;
+ }
+ }
+ status = ocfs2_mount_volume(sb);
if (status < 0)
goto read_super_error;
+ if (osb->root_inode)
+ inode = igrab(osb->root_inode);
+
if (!inode) {
status = -EIO;
mlog_errno(status);
goto read_super_error;
}
- root = d_alloc_root(inode);
+ root = d_make_root(inode);
if (!root) {
status = -ENOMEM;
mlog_errno(status);
@@ -1149,7 +1206,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
mlog_errno(status);
atomic_set(&osb->vol_state, VOLUME_DISABLED);
wake_up(&osb->osb_mount_event);
- mlog_exit(status);
return status;
}
}
@@ -1160,64 +1216,96 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
wake_up(&osb->osb_mount_event);
- mlog_exit(status);
+ /* Start this when the mount is almost sure of being successful */
+ ocfs2_orphan_scan_start(osb);
+
return status;
read_super_error:
brelse(bh);
- if (inode)
- iput(inode);
-
if (osb) {
atomic_set(&osb->vol_state, VOLUME_DISABLED);
wake_up(&osb->osb_mount_event);
ocfs2_dismount_volume(sb, 1);
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-static int ocfs2_get_sb(struct file_system_type *fs_type,
+static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
int flags,
const char *dev_name,
- void *data,
- struct vfsmount *mnt)
+ void *data)
{
- return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super,
- mnt);
+ return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
}
static struct file_system_type ocfs2_fs_type = {
.owner = THIS_MODULE,
.name = "ocfs2",
- .get_sb = ocfs2_get_sb, /* is this called when we mount
- * the fs? */
- .kill_sb = kill_block_super, /* set to the generic one
- * right now, but do we
- * need to change that? */
+ .mount = ocfs2_mount,
+ .kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
.next = NULL
};
+MODULE_ALIAS_FS("ocfs2");
+
+static int ocfs2_check_set_options(struct super_block *sb,
+ struct mount_options *options)
+{
+ if (options->mount_opt & OCFS2_MOUNT_USRQUOTA &&
+ !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+ mlog(ML_ERROR, "User quotas were requested, but this "
+ "filesystem does not have the feature enabled.\n");
+ return 0;
+ }
+ if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA &&
+ !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+ mlog(ML_ERROR, "Group quotas were requested, but this "
+ "filesystem does not have the feature enabled.\n");
+ return 0;
+ }
+ if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL &&
+ !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) {
+ mlog(ML_ERROR, "ACL support requested but extended attributes "
+ "feature is not enabled\n");
+ return 0;
+ }
+ /* No ACL setting specified? Use XATTR feature... */
+ if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL |
+ OCFS2_MOUNT_NO_POSIX_ACL))) {
+ if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR))
+ options->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+ else
+ options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
+ }
+ return 1;
+}
static int ocfs2_parse_options(struct super_block *sb,
char *options,
struct mount_options *mopt,
int is_remount)
{
- int status;
+ int status, user_stack = 0;
char *p;
+ u32 tmp;
- mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
- options ? options : "(none)");
+ trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
mopt->commit_interval = 0;
- mopt->mount_opt = 0;
+ mopt->mount_opt = OCFS2_MOUNT_NOINTR;
mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
mopt->slot = OCFS2_INVALID_SLOT;
- mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+ mopt->localalloc_opt = -1;
mopt->cluster_stack[0] = '\0';
+ mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
+ mopt->dir_resv_level = -1;
if (!options) {
status = 1;
@@ -1237,7 +1325,10 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
break;
case Opt_hb_none:
- mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
+ mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
+ break;
+ case Opt_hb_global:
+ mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
break;
case Opt_barrier:
if (match_int(&args[0], &option)) {
@@ -1308,7 +1399,7 @@ static int ocfs2_parse_options(struct super_block *sb,
status = 0;
goto bail;
}
- if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
+ if (option >= 0)
mopt->localalloc_opt = option;
break;
case Opt_localflocks:
@@ -1343,45 +1434,61 @@ static int ocfs2_parse_options(struct super_block *sb,
memcpy(mopt->cluster_stack, args[0].from,
OCFS2_STACK_LABEL_LEN);
mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+ /*
+ * Open code the memcmp here as we don't have
+ * an osb to pass to
+ * ocfs2_userspace_stack().
+ */
+ if (memcmp(mopt->cluster_stack,
+ OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ user_stack = 1;
break;
case Opt_inode64:
mopt->mount_opt |= OCFS2_MOUNT_INODE64;
break;
case Opt_usrquota:
- /* We check only on remount, otherwise features
- * aren't yet initialized. */
- if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
- OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
- mlog(ML_ERROR, "User quota requested but "
- "filesystem feature is not set\n");
- status = 0;
- goto bail;
- }
mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
break;
case Opt_grpquota:
- if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
- mlog(ML_ERROR, "Group quota requested but "
- "filesystem feature is not set\n");
- status = 0;
- goto bail;
- }
mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
break;
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+ case Opt_coherency_buffered:
+ mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
+ break;
+ case Opt_coherency_full:
+ mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
+ break;
case Opt_acl:
mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+ mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
break;
case Opt_noacl:
+ mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
break;
-#else
- case Opt_acl:
- case Opt_noacl:
- printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
+ case Opt_resv_level:
+ if (is_remount)
+ break;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option >= OCFS2_MIN_RESV_LEVEL &&
+ option < OCFS2_MAX_RESV_LEVEL)
+ mopt->resv_level = option;
+ break;
+ case Opt_dir_resv_level:
+ if (is_remount)
+ break;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option >= OCFS2_MIN_RESV_LEVEL &&
+ option < OCFS2_MAX_RESV_LEVEL)
+ mopt->dir_resv_level = option;
break;
-#endif
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1391,23 +1498,38 @@ static int ocfs2_parse_options(struct super_block *sb,
}
}
+ if (user_stack == 0) {
+ /* Ensure only one heartbeat mode */
+ tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
+ OCFS2_MOUNT_HB_GLOBAL |
+ OCFS2_MOUNT_HB_NONE);
+ if (hweight32(tmp) != 1) {
+ mlog(ML_ERROR, "Invalid heartbeat mount options\n");
+ status = 0;
+ goto bail;
+ }
+ }
+
status = 1;
bail:
- mlog_exit(status);
return status;
}
-static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
{
- struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
+ struct ocfs2_super *osb = OCFS2_SB(root->d_sb);
unsigned long opts = osb->s_mount_opt;
unsigned int local_alloc_megs;
- if (opts & OCFS2_MOUNT_HB_LOCAL)
- seq_printf(s, ",_netdev,heartbeat=local");
- else
- seq_printf(s, ",heartbeat=none");
+ if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
+ seq_printf(s, ",_netdev");
+ if (opts & OCFS2_MOUNT_HB_LOCAL)
+ seq_printf(s, ",%s", OCFS2_HB_LOCAL);
+ else
+ seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
+ } else
+ seq_printf(s, ",%s", OCFS2_HB_NONE);
if (opts & OCFS2_MOUNT_NOINTR)
seq_printf(s, ",nointr");
@@ -1428,15 +1550,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (osb->preferred_slot != OCFS2_INVALID_SLOT)
seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
- if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
- seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
+ seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
if (osb->osb_commit_interval)
seq_printf(s, ",commit=%u",
(unsigned) (osb->osb_commit_interval / HZ));
local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
- if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+ if (local_alloc_megs != ocfs2_la_default_mb(osb))
seq_printf(s, ",localalloc=%d", local_alloc_megs);
if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -1450,6 +1571,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (opts & OCFS2_MOUNT_GRPQUOTA)
seq_printf(s, ",grpquota");
+ if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
+ seq_printf(s, ",coherency=buffered");
+ else
+ seq_printf(s, ",coherency=full");
+
if (opts & OCFS2_MOUNT_NOUSERXATTR)
seq_printf(s, ",nouser_xattr");
else
@@ -1458,12 +1584,16 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (opts & OCFS2_MOUNT_INODE64)
seq_printf(s, ",inode64");
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
if (opts & OCFS2_MOUNT_POSIX_ACL)
seq_printf(s, ",acl");
else
seq_printf(s, ",noacl");
-#endif
+
+ if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
+ seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
+
+ if (osb->osb_dir_resv_level != osb->osb_resv_level)
+ seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
return 0;
}
@@ -1472,26 +1602,18 @@ static int __init ocfs2_init(void)
{
int status;
- mlog_entry_void();
-
- ocfs2_print_version();
-
status = init_ocfs2_uptodate_cache();
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ if (status < 0)
+ goto out1;
status = ocfs2_initialize_mem_caches();
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ if (status < 0)
+ goto out2;
ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
if (!ocfs2_wq) {
status = -ENOMEM;
- goto leave;
+ goto out3;
}
ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
@@ -1500,34 +1622,30 @@ static int __init ocfs2_init(void)
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
}
- status = ocfs2_quota_setup();
- if (status)
- goto leave;
-
ocfs2_set_locking_protocol();
status = register_quota_format(&ocfs2_quota_format);
-leave:
- if (status < 0) {
- ocfs2_quota_shutdown();
- ocfs2_free_mem_caches();
- exit_ocfs2_uptodate_cache();
- }
-
- mlog_exit(status);
+ if (status < 0)
+ goto out4;
+ status = register_filesystem(&ocfs2_fs_type);
+ if (!status)
+ return 0;
- if (status >= 0) {
- return register_filesystem(&ocfs2_fs_type);
- } else
- return -1;
+ unregister_quota_format(&ocfs2_quota_format);
+out4:
+ destroy_workqueue(ocfs2_wq);
+ debugfs_remove(ocfs2_debugfs_root);
+out3:
+ ocfs2_free_mem_caches();
+out2:
+ exit_ocfs2_uptodate_cache();
+out1:
+ mlog_errno(status);
+ return status;
}
static void __exit ocfs2_exit(void)
{
- mlog_entry_void();
-
- ocfs2_quota_shutdown();
-
if (ocfs2_wq) {
flush_workqueue(ocfs2_wq);
destroy_workqueue(ocfs2_wq);
@@ -1542,18 +1660,14 @@ static void __exit ocfs2_exit(void)
unregister_filesystem(&ocfs2_fs_type);
exit_ocfs2_uptodate_cache();
-
- mlog_exit_void();
}
static void ocfs2_put_super(struct super_block *sb)
{
- mlog_entry("(0x%p)\n", sb);
+ trace_ocfs2_put_super(sb);
ocfs2_sync_blockdev(sb);
ocfs2_dismount_volume(sb, 0);
-
- mlog_exit_void();
}
static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1565,7 +1679,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
struct buffer_head *bh = NULL;
struct inode *inode = NULL;
- mlog_entry("(%p, %p)\n", dentry->d_sb, buf);
+ trace_ocfs2_statfs(dentry->d_sb, buf);
osb = OCFS2_SB(dentry->d_sb);
@@ -1599,6 +1713,10 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = buf->f_bfree;
buf->f_files = numbits;
buf->f_ffree = freebits;
+ buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN)
+ & 0xFFFFFFFFUL;
+ buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN,
+ OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL;
brelse(bh);
@@ -1608,7 +1726,8 @@ bail:
if (inode)
iput(inode);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1622,10 +1741,8 @@ static void ocfs2_inode_init_once(void *data)
spin_lock_init(&oi->ip_lock);
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
- oi->ip_created_trans = 0;
- oi->ip_last_trans = 0;
oi->ip_dir_start_lookup = 0;
-
+ mutex_init(&oi->ip_unaligned_aio);
init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
@@ -1633,11 +1750,14 @@ static void ocfs2_inode_init_once(void *data)
oi->ip_blkno = 0ULL;
oi->ip_clusters = 0;
+ ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
- ocfs2_metadata_cache_init(&oi->vfs_inode);
+ ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
+ &ocfs2_inode_caching_ops);
inode_init_once(&oi->vfs_inode);
}
@@ -1677,6 +1797,11 @@ static int ocfs2_initialize_mem_caches(void)
static void ocfs2_free_mem_caches(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
if (ocfs2_inode_cachep)
kmem_cache_destroy(ocfs2_inode_cachep);
ocfs2_inode_cachep = NULL;
@@ -1702,8 +1827,8 @@ static int ocfs2_get_sector(struct super_block *sb,
*bh = sb_getblk(sb, block);
if (!*bh) {
- mlog_errno(-EIO);
- return -EIO;
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
lock_buffer(*bh);
if (!buffer_dirty(*bh))
@@ -1727,8 +1852,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
int unlock_super = 0;
struct ocfs2_super *osb = OCFS2_SB(sb);
- mlog_entry_void();
-
if (ocfs2_is_hard_readonly(osb))
goto leave;
@@ -1766,19 +1889,13 @@ static int ocfs2_mount_volume(struct super_block *sb)
}
status = ocfs2_truncate_log_init(osb);
- if (status < 0) {
+ if (status < 0)
mlog_errno(status);
- goto leave;
- }
-
- if (ocfs2_mount_local(osb))
- goto leave;
leave:
if (unlock_super)
ocfs2_super_unlock(osb, 1);
- mlog_exit(status);
return status;
}
@@ -1786,9 +1903,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
{
int tmp, hangup_needed = 0;
struct ocfs2_super *osb = NULL;
- char nodestr[8];
+ char nodestr[12];
- mlog_entry("(0x%p)\n", sb);
+ trace_ocfs2_dismount_volume(sb);
BUG_ON(!sb);
osb = OCFS2_SB(sb);
@@ -1796,8 +1913,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
debugfs_remove(osb->osb_ctxt);
+ /* Orphan scan should be stopped as early as possible */
+ ocfs2_orphan_scan_stop(osb);
+
ocfs2_disable_quotas(osb);
+ /* All dquots should be freed by now */
+ WARN_ON(!llist_empty(&osb->dquot_drop_list));
+ /* Wait for worker to be done with the work structure in osb */
+ cancel_work_sync(&osb->dquot_drop_work);
+
ocfs2_shutdown_local_alloc(osb);
ocfs2_truncate_log_shutdown(osb);
@@ -1809,6 +1934,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_sync_blockdev(sb);
+ ocfs2_purge_refcount_trees(osb);
+
/* No cluster connection means we've failed during mount, so skip
* all the steps which depended on that to complete. */
if (osb->cconn) {
@@ -1833,12 +1960,14 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
* If we failed before we got a uuid_str yet, we can't stop
* heartbeat. Otherwise, do it.
*/
- if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+ if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
+ !ocfs2_is_hard_readonly(osb))
hangup_needed = 1;
if (osb->cconn)
ocfs2_dlm_shutdown(osb, hangup_needed);
+ ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
debugfs_remove(osb->osb_debug_root);
if (hangup_needed)
@@ -1884,19 +2013,48 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
return 0;
}
+/*
+ * Make sure the entire volume is addressable by our journal.
+ *
+ * Requires osb_clusters_at_boot to be valid and the journal to have been
+ * initialized by ocfs2_journal_init().
+ *
+ * Returns 0 when the last block of the volume fits in 32 bits, or when the
+ * volume has OCFS2_FEATURE_COMPAT_JBD2_SB set and the journal reports
+ * JBD2_FEATURE_INCOMPAT_64BIT as in use.  Returns -EFBIG otherwise.
+ */
+static int ocfs2_journal_addressable(struct ocfs2_super *osb)
+{
+	int status = 0;
+	u64 max_block =
+		ocfs2_clusters_to_blocks(osb->sb,
+					 osb->osb_clusters_at_boot) - 1;
+
+	/* 32-bit block number is always OK. */
+	if (max_block <= (u32)~0ULL)
+		goto out;
+
+	/*
+	 * Volume is "huge", so see if our journal is new enough to
+	 * support it.
+	 */
+	if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+				       OCFS2_FEATURE_COMPAT_JBD2_SB) &&
+	      jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
+					       JBD2_FEATURE_INCOMPAT_64BIT))) {
+		/* Terminate the message so later log output starts on its own line. */
+		mlog(ML_ERROR, "The journal cannot address the entire volume. "
+		     "Enable the 'block64' journal option with tunefs.ocfs2\n");
+		status = -EFBIG;
+		goto out;
+	}
+
+ out:
+	return status;
+}
+
static int ocfs2_initialize_super(struct super_block *sb,
struct buffer_head *bh,
- int sector_size)
+ int sector_size,
+ struct ocfs2_blockcheck_stats *stats)
{
int status;
int i, cbits, bbits;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
struct inode *inode = NULL;
struct ocfs2_journal *journal;
- __le32 uuid_net_key;
struct ocfs2_super *osb;
-
- mlog_entry_void();
+ u64 total_blocks;
osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL);
if (!osb) {
@@ -1907,6 +2065,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_fs_info = osb;
sb->s_op = &ocfs2_sops;
+ sb->s_d_op = &ocfs2_dentry_ops;
sb->s_export_op = &ocfs2_export_ops;
sb->s_qcop = &ocfs2_quotactl_ops;
sb->dq_op = &ocfs2_quota_operations;
@@ -1937,7 +2096,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->blocked_lock_count = 0;
spin_lock_init(&osb->osb_lock);
spin_lock_init(&osb->osb_xattr_lock);
- ocfs2_init_inode_steal_slot(osb);
+ ocfs2_init_steal_slots(osb);
+
+ mutex_init(&osb->system_file_mutex);
atomic_set(&osb->alloc_stats.moves, 0);
atomic_set(&osb->alloc_stats.local_data, 0);
@@ -1945,11 +2106,24 @@ static int ocfs2_initialize_super(struct super_block *sb,
atomic_set(&osb->alloc_stats.bg_allocs, 0);
atomic_set(&osb->alloc_stats.bg_extends, 0);
+ /* Copy the blockcheck stats from the superblock probe */
+ osb->osb_ecc_stats = *stats;
+
ocfs2_init_node_maps(osb);
snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+ osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
+ if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
+ mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
+ osb->max_slots);
+ status = -EINVAL;
+ goto bail;
+ }
+
+ ocfs2_orphan_scan_init(osb);
+
status = ocfs2_recovery_init(osb);
if (status) {
mlog(ML_ERROR, "Unable to initialize recovery state\n");
@@ -1958,7 +2132,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
init_waitqueue_head(&osb->checkpoint_event);
- atomic_set(&osb->needs_checkpoint, 0);
osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
@@ -1973,6 +2146,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
init_waitqueue_head(&osb->osb_mount_event);
+ status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
if (!osb->vol_label) {
mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -1980,15 +2159,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
- if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
- mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
- osb->max_slots);
- status = -EINVAL;
- goto bail;
- }
- mlog(0, "max_slots for this device: %u\n", osb->max_slots);
-
osb->slot_recovery_generations =
kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
GFP_KERNEL);
@@ -2008,6 +2178,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
+ osb->osb_rf_lock_tree = RB_ROOT;
+
osb->s_feature_compat =
le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
osb->s_feature_ro_compat =
@@ -2029,11 +2201,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- if (ocfs2_userspace_stack(osb)) {
- memcpy(osb->osb_cluster_stack,
+ if (ocfs2_clusterinfo_valid(osb)) {
+ osb->osb_stackflags =
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
+ strlcpy(osb->osb_cluster_stack,
OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
- OCFS2_STACK_LABEL_LEN);
- osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+ OCFS2_STACK_LABEL_LEN + 1);
if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
mlog(ML_ERROR,
"couldn't mount because of an invalid "
@@ -2042,6 +2215,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = -EINVAL;
goto bail;
}
+ strlcpy(osb->osb_cluster_name,
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
+ OCFS2_CLUSTER_NAME_LEN + 1);
} else {
/* The empty string is identical with classic tools that
* don't know about s_cluster_info. */
@@ -2077,14 +2253,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
journal->j_state = OCFS2_JOURNAL_FREE;
- INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes);
- osb->dentry_lock_list = NULL;
+ INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
+ init_llist_head(&osb->dquot_drop_list);
/* get some pseudo constants for clustersize bits */
osb->s_clustersize_bits =
le32_to_cpu(di->id2.i_super.s_clustersize_bits);
osb->s_clustersize = 1 << osb->s_clustersize_bits;
- mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
@@ -2094,11 +2269,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
- > (u32)~0UL) {
- mlog(ML_ERROR, "Volume might try to write to blocks beyond "
- "what jbd can address in 32 bits.\n");
- status = -EINVAL;
+ total_blocks = ocfs2_clusters_to_blocks(osb->sb,
+ le32_to_cpu(di->i_clusters));
+
+ status = generic_check_addressable(osb->sb->s_blocksize_bits,
+ total_blocks);
+ if (status) {
+ mlog(ML_ERROR, "Volume too large "
+ "to mount safely on this system");
+ status = -EFBIG;
goto bail;
}
@@ -2109,21 +2288,18 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
-
- strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
- osb->vol_label[63] = '\0';
+ strlcpy(osb->vol_label, di->id2.i_super.s_label,
+ OCFS2_MAX_VOL_LABEL_LEN);
osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
osb->first_cluster_group_blkno =
le64_to_cpu(di->id2.i_super.s_first_cluster_group);
osb->fs_generation = le32_to_cpu(di->i_fs_generation);
osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
- mlog(0, "vol_label: %s\n", osb->vol_label);
- mlog(0, "uuid: %s\n", osb->uuid_str);
- mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
- (unsigned long long)osb->root_blkno,
- (unsigned long long)osb->system_dir_blkno);
+ trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str,
+ (unsigned long long)osb->root_blkno,
+ (unsigned long long)osb->system_dir_blkno,
+ osb->s_clustersize_bits);
osb->osb_dlm_debug = ocfs2_new_dlm_debug();
if (!osb->osb_dlm_debug) {
@@ -2153,18 +2329,20 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
+ osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
iput(inode);
- osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
+ osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
+ osb->s_feature_incompat) * 8;
status = ocfs2_init_slot_info(osb);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+ cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
bail:
- mlog_exit(status);
return status;
}
@@ -2175,12 +2353,11 @@ bail:
*/
static int ocfs2_verify_volume(struct ocfs2_dinode *di,
struct buffer_head *bh,
- u32 blksz)
+ u32 blksz,
+ struct ocfs2_blockcheck_stats *stats)
{
int status = -EAGAIN;
- mlog_entry_void();
-
if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
/* We have to do a raw check of the feature here */
@@ -2188,7 +2365,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
OCFS2_FEATURE_INCOMPAT_META_ECC) {
status = ocfs2_block_check_validate(bh->b_data,
bh->b_size,
- &di->i_check);
+ &di->i_check,
+ stats);
if (status)
goto out;
}
@@ -2234,7 +2412,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
}
out:
- mlog_exit(status);
+ if (status && status != -EAGAIN)
+ mlog_errno(status);
return status;
}
@@ -2247,8 +2426,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
* recover
* ourselves. */
- mlog_entry_void();
-
/* Init our journal object. */
status = ocfs2_journal_init(osb->journal, &dirty);
if (status < 0) {
@@ -2256,6 +2433,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
goto finally;
}
+ /* Now that journal has been initialized, check to make sure
+ entire volume is addressable. */
+ status = ocfs2_journal_addressable(osb);
+ if (status)
+ goto finally;
+
/* If the journal was unmounted cleanly then we don't want to
* recover anything. Otherwise, journal_load will do that
* dirty work for us :) */
@@ -2266,8 +2449,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
goto finally;
}
} else {
- mlog(ML_NOTICE, "File system was not unmounted cleanly, "
- "recovering volume.\n");
+ printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
+ "unmounted cleanly, recovering it.\n", osb->dev_str);
}
local = ocfs2_mount_local(osb);
@@ -2292,8 +2475,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
* ourselves as mounted. */
}
- mlog(0, "Journal loaded.\n");
-
status = ocfs2_load_local_alloc(osb);
if (status < 0) {
mlog_errno(status);
@@ -2322,10 +2503,10 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
mlog_errno(status);
finally:
- if (local_alloc)
- kfree(local_alloc);
+ kfree(local_alloc);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -2337,8 +2518,6 @@ finally:
*/
static void ocfs2_delete_osb(struct ocfs2_super *osb)
{
- mlog_entry_void();
-
/* This function assumes that the caller has the main osb resource */
ocfs2_free_slot_info(osb);
@@ -2347,17 +2526,14 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
kfree(osb->slot_recovery_generations);
/* FIXME
* This belongs in journal shutdown, but because we have to
- * allocate osb->journal at the start of ocfs2_initalize_osb(),
+ * allocate osb->journal at the start of ocfs2_initialize_osb(),
* we free it here.
*/
kfree(osb->journal);
- if (osb->local_alloc_copy)
- kfree(osb->local_alloc_copy);
+ kfree(osb->local_alloc_copy);
kfree(osb->uuid_str);
ocfs2_put_dlm_debug(osb->osb_dlm_debug);
memset(osb, 0, sizeof(struct ocfs2_super));
-
- mlog_exit_void();
}
/* Put OCFS2 into a readonly state, or (if the user specifies it),
@@ -2431,9 +2607,30 @@ void __ocfs2_abort(struct super_block* sb,
/* Force a panic(). This stinks, but it's better than letting
* things continue without having a proper hard readonly
* here. */
- OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
+ if (!ocfs2_mount_local(OCFS2_SB(sb)))
+ OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
ocfs2_handle_error(sb);
}
+/*
+ * Block every signal, storing the previous mask in @oldset.
+ *
+ * Returns void because the in-kernel sigprocmask() only fails when the
+ * SIG_* operation is wrong, which is treated as a bug here.
+ */
+void ocfs2_block_signals(sigset_t *oldset)
+{
+	sigset_t all_signals;
+	int err;
+
+	sigfillset(&all_signals);
+	err = sigprocmask(SIG_BLOCK, &all_signals, oldset);
+	BUG_ON(err);
+}
+
+/*
+ * Install @oldset as the signal mask (SIG_SETMASK); meant to be handed the
+ * mask that ocfs2_block_signals() saved.  A failing sigprocmask() is
+ * treated as a bug.
+ */
+void ocfs2_unblock_signals(sigset_t *oldset)
+{
+	int err;
+
+	err = sigprocmask(SIG_SETMASK, oldset, NULL);
+	BUG_ON(err);
+}
+
module_init(ocfs2_init);
module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a..74ff74cf78f 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -31,18 +31,23 @@ extern struct workqueue_struct *ocfs2_wq;
int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
-void __ocfs2_error(struct super_block *sb,
- const char *function,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
+__printf(3, 4)
+void __ocfs2_error(struct super_block *sb, const char *function,
+ const char *fmt, ...);
#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
-void __ocfs2_abort(struct super_block *sb,
- const char *function,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
+__printf(3, 4)
+void __ocfs2_abort(struct super_block *sb, const char *function,
+ const char *fmt, ...);
#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset);
+void ocfs2_unblock_signals(sigset_t *oldset);
+
#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 579dd1b1110..66edce7ecfd 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -38,10 +38,8 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
-#include <linux/utsname.h>
#include <linux/namei.h>
-#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -56,106 +54,40 @@
#include "buffer_head_io.h"
-static char *ocfs2_fast_symlink_getlink(struct inode *inode,
- struct buffer_head **bh)
+/*
+ * ->readpage for fast symlinks: read the inode block, copy the link target
+ * held in the dinode's id2.i_symlink into @page, mark the page up to date
+ * and unlock it.
+ *
+ * Returns 0 on success or the negative status from
+ * ocfs2_read_inode_block().  The page is unlocked on the failure path as
+ * well, so it is never left locked.
+ */
+static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
{
- int status;
- char *link = NULL;
+ struct inode *inode = page->mapping->host;
+ struct buffer_head *bh = NULL;
+ int status = ocfs2_read_inode_block(inode, &bh);
struct ocfs2_dinode *fe;
+ const char *link;
+ void *kaddr;
+ size_t len;
- mlog_entry_void();
-
- status = ocfs2_read_inode_block(inode, bh);
if (status < 0) {
mlog_errno(status);
- link = ERR_PTR(status);
- goto bail;
+ /*
+ * The success path below unlocks the page, so it arrives here
+ * locked; release it before bailing out as well.
+ */
+ unlock_page(page);
+ return status;
}
- fe = (struct ocfs2_dinode *) (*bh)->b_data;
+ fe = (struct ocfs2_dinode *) bh->b_data;
link = (char *) fe->id2.i_symlink;
-bail:
- mlog_exit(status);
-
- return link;
-}
-
-static int ocfs2_readlink(struct dentry *dentry,
- char __user *buffer,
- int buflen)
-{
- int ret;
- char *link;
- struct buffer_head *bh = NULL;
- struct inode *inode = dentry->d_inode;
-
- mlog_entry_void();
-
- link = ocfs2_fast_symlink_getlink(inode, &bh);
- if (IS_ERR(link)) {
- ret = PTR_ERR(link);
- goto out;
- }
-
- /*
- * Without vfsmount we can't update atime now,
- * but we will update atime here ultimately.
- */
- ret = vfs_readlink(dentry, buffer, buflen, link);
-
- brelse(bh);
-out:
- mlog_exit(ret);
- return ret;
-}
-
-static void *ocfs2_fast_follow_link(struct dentry *dentry,
- struct nameidata *nd)
-{
- int status = 0;
- int len;
- char *target, *link = ERR_PTR(-ENOMEM);
- struct inode *inode = dentry->d_inode;
- struct buffer_head *bh = NULL;
-
- mlog_entry_void();
-
- BUG_ON(!ocfs2_inode_is_fast_symlink(inode));
- target = ocfs2_fast_symlink_getlink(inode, &bh);
- if (IS_ERR(target)) {
- status = PTR_ERR(target);
- mlog_errno(status);
- goto bail;
- }
-
- /* Fast symlinks can't be large */
- len = strlen(target);
- link = kzalloc(len + 1, GFP_NOFS);
- if (!link) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
-
- memcpy(link, target, len);
- nd_set_link(nd, link);
-
-bail:
+ /* will be less than a page size */
+ len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb));
+ kaddr = kmap_atomic(page);
+ memcpy(kaddr, link, len + 1);
+ kunmap_atomic(kaddr);
+ SetPageUptodate(page);
+ unlock_page(page);
brelse(bh);
-
- mlog_exit(status);
- return status ? ERR_PTR(status) : link;
+ return 0;
}
-static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
-{
- char *link = cookie;
-
- kfree(link);
-}
+/* Address-space operations for fast symlinks: only ->readpage is provided. */
+const struct address_space_operations ocfs2_fast_symlink_aops = {
+ .readpage = ocfs2_fast_symlink_readpage,
+};
const struct inode_operations ocfs2_symlink_inode_operations = {
- .readlink = page_readlink,
+ .readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
.getattr = ocfs2_getattr,
@@ -164,15 +96,5 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
.getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
.removexattr = generic_removexattr,
-};
-const struct inode_operations ocfs2_fast_symlink_inode_operations = {
- .readlink = ocfs2_readlink,
- .follow_link = ocfs2_fast_follow_link,
- .put_link = ocfs2_fast_put_link,
- .getattr = ocfs2_getattr,
- .setattr = ocfs2_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = ocfs2_listxattr,
- .removexattr = generic_removexattr,
+ .fiemap = ocfs2_fiemap,
};
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
index 65a6c9c6ad5..71ee4245e91 100644
--- a/fs/ocfs2/symlink.h
+++ b/fs/ocfs2/symlink.h
@@ -27,7 +27,7 @@
#define OCFS2_SYMLINK_H
extern const struct inode_operations ocfs2_symlink_inode_operations;
-extern const struct inode_operations ocfs2_fast_symlink_inode_operations;
+extern const struct address_space_operations ocfs2_fast_symlink_aops;
/*
* Test whether an inode is a fast symlink.
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index ab713ebdd54..af155c18312 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,10 +25,8 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -45,10 +43,9 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
int type,
u32 slot);
-static inline int is_global_system_inode(int type);
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
- int type,
- u32 slot);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
+#endif
static inline int is_global_system_inode(int type)
{
@@ -56,11 +53,51 @@ static inline int is_global_system_inode(int type)
type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
}
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
- int type,
- u32 slot)
+/*
+ * Return the address of the cached-inode slot for local system inode
+ * @type belonging to node slot @slot, allocating osb->local_system_inodes
+ * the first time it is needed.
+ *
+ * The array holds NUM_LOCAL_SYSTEM_INODES pointers for each of
+ * osb->max_slots slots.  Returns NULL when the array cannot be allocated.
+ */
+static struct inode **get_local_system_inode(struct ocfs2_super *osb,
+					     int type,
+					     u32 slot)
{
- return slot == osb->slot_num || is_global_system_inode(type);
+	int index;
+	struct inode **local_system_inodes, **free = NULL;
+
+	BUG_ON(slot == OCFS2_INVALID_SLOT);
+	BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
+	       type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
+
+	spin_lock(&osb->osb_lock);
+	local_system_inodes = osb->local_system_inodes;
+	spin_unlock(&osb->osb_lock);
+
+	if (unlikely(!local_system_inodes)) {
+		/*
+		 * Allocate outside osb_lock.  kcalloc() returns zeroed memory
+		 * and fails cleanly if count * size would overflow.
+		 */
+		local_system_inodes = kcalloc(NUM_LOCAL_SYSTEM_INODES *
+					      osb->max_slots,
+					      sizeof(struct inode *),
+					      GFP_NOFS);
+		if (!local_system_inodes) {
+			mlog_errno(-ENOMEM);
+			/*
+			 * return NULL here so that
+			 * ocfs2_get_system_file_inode will try to create an
+			 * inode and use it. We will try to initialize
+			 * local_system_inodes next time.
+			 */
+			return NULL;
+		}
+
+		spin_lock(&osb->osb_lock);
+		if (osb->local_system_inodes) {
+			/* Someone has initialized it for us. */
+			free = local_system_inodes;
+			local_system_inodes = osb->local_system_inodes;
+		} else
+			osb->local_system_inodes = local_system_inodes;
+		spin_unlock(&osb->osb_lock);
+		/* free stays NULL when our array was installed; kfree(NULL) is a no-op. */
+		kfree(free);
+	}
+
+	index = (slot * NUM_LOCAL_SYSTEM_INODES) +
+		(type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
+
+	return &local_system_inodes[index];
}
struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -71,12 +108,16 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
struct inode **arr = NULL;
/* avoid the lookup if cached in local system file array */
- if (is_in_system_inode_array(osb, type, slot))
- arr = &(osb->system_inodes[type]);
+ if (is_global_system_inode(type)) {
+ arr = &(osb->global_system_inodes[type]);
+ } else
+ arr = get_local_system_inode(osb, type, slot);
+ mutex_lock(&osb->system_file_mutex);
if (arr && ((inode = *arr) != NULL)) {
/* get a ref in addition to the array ref */
inode = igrab(inode);
+ mutex_unlock(&osb->system_file_mutex);
BUG_ON(!inode);
return inode;
@@ -90,6 +131,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
*arr = igrab(inode);
BUG_ON(!*arr);
}
+ mutex_unlock(&osb->system_file_mutex);
return inode;
}
@@ -118,6 +160,21 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
inode = NULL;
goto bail;
}
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
+ type == LOCAL_GROUP_QUOTA_SYSTEM_INODE ||
+ type == JOURNAL_SYSTEM_INODE) {
+ /* Ignore inode lock on these inodes as the lock does not
+ * really belong to any process and lockdep cannot handle
+ * that */
+ OCFS2_I(inode)->ip_inode_lockres.l_lockdep_map.key = NULL;
+ } else {
+ lockdep_init_map(&OCFS2_I(inode)->ip_inode_lockres.
+ l_lockdep_map,
+ ocfs2_system_inodes[type].si_name,
+ &ocfs2_sysfile_cluster_lock_key[type], 0);
+ }
+#endif
bail:
return inode;
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 187b99ff036..82e17b076ce 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,13 +53,6 @@
#include <linux/highmem.h>
#include <linux/buffer_head.h>
#include <linux/rbtree.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-#endif
-
-#define MLOG_MASK_PREFIX ML_UPTODATE
#include <cluster/masklog.h>
@@ -67,23 +60,86 @@
#include "inode.h"
#include "uptodate.h"
+#include "ocfs2_trace.h"
struct ocfs2_meta_cache_item {
struct rb_node c_node;
sector_t c_block;
};
-static struct kmem_cache *ocfs2_uptodate_cachep = NULL;
+static struct kmem_cache *ocfs2_uptodate_cachep;
+
+/*
+ * Return the owner value that @ci's co_owner() callback reports.
+ * @ci and its operations table must both be set.
+ */
+u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
+{
+	const struct ocfs2_caching_operations *ops;
+
+	BUG_ON(ci == NULL || ci->ci_ops == NULL);
+	ops = ci->ci_ops;
+
+	return ops->co_owner(ci);
+}
+
+/*
+ * Return the super block that @ci's co_get_super() callback reports.
+ * @ci and its operations table must both be set.
+ */
+struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci)
+{
+	const struct ocfs2_caching_operations *ops;
+
+	BUG_ON(ci == NULL || ci->ci_ops == NULL);
+	ops = ci->ci_ops;
+
+	return ops->co_get_super(ci);
+}
+
+/*
+ * Take the cache lock for @ci by calling its co_cache_lock() callback.
+ * @ci and its operations table must both be set.
+ */
+static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci)
+{
+	const struct ocfs2_caching_operations *ops;
+
+	BUG_ON(ci == NULL || ci->ci_ops == NULL);
+	ops = ci->ci_ops;
+
+	ops->co_cache_lock(ci);
+}
+
+/*
+ * Release the cache lock for @ci by calling its co_cache_unlock()
+ * callback.  @ci and its operations table must both be set.
+ */
+static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci)
+{
+	const struct ocfs2_caching_operations *ops;
+
+	BUG_ON(ci == NULL || ci->ci_ops == NULL);
+	ops = ci->ci_ops;
+
+	ops->co_cache_unlock(ci);
+}
+
+/*
+ * Take the I/O lock for @ci by calling its co_io_lock() callback.
+ * @ci and its operations table must both be set.
+ */
+void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+	const struct ocfs2_caching_operations *ops;
+
+	BUG_ON(ci == NULL || ci->ci_ops == NULL);
+	ops = ci->ci_ops;
+
+	ops->co_io_lock(ci);
+}
-void ocfs2_metadata_cache_init(struct inode *inode)
+/*
+ * Release the I/O lock for @ci by calling its co_io_unlock() callback.
+ * @ci and its operations table must both be set.
+ */
+void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci)
{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+	const struct ocfs2_caching_operations *ops;
+
+	BUG_ON(ci == NULL || ci->ci_ops == NULL);
+	ops = ci->ci_ops;
- oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
+	ops->co_io_unlock(ci);
+}
+
+
+/*
+ * Put @ci back into the empty inline-array state: set
+ * OCFS2_CACHE_FL_INLINE and zero the cached-item count.  When @clear is
+ * non-zero, also zero ci_created_trans and ci_last_trans.
+ */
+static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci,
+				       int clear)
+{
+	ci->ci_flags |= OCFS2_CACHE_FL_INLINE;
ci->ci_num_cached = 0;
+
+	if (!clear)
+		return;
+
+	/* A clearing reset also forgets the transaction bookkeeping. */
+	ci->ci_created_trans = 0;
+	ci->ci_last_trans = 0;
}
+/*
+ * Attach the operations table @ops to @ci and reset the cache to its
+ * empty state, transaction fields included.  @ops must not be NULL.
+ */
+void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
+			       const struct ocfs2_caching_operations *ops)
+{
+	BUG_ON(ops == NULL);
+
+	ci->ci_ops = ops;
+
+	/* Second argument 1: clear the transaction fields as well. */
+	ocfs2_metadata_cache_reset(ci, 1);
+}
+
+/*
+ * Tear down @ci: purge the cached items, then reset the cache state with
+ * clear == 1 so the transaction fields are zeroed too.
+ */
+void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci)
+{
+ ocfs2_metadata_cache_purge(ci);
+ ocfs2_metadata_cache_reset(ci, 1);
+}
+
+
/* No lock taken here as 'root' is not expected to be visible to other
* processes. */
static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
@@ -95,8 +151,8 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
while ((node = rb_last(root)) != NULL) {
item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
- mlog(0, "Purge item %llu\n",
- (unsigned long long) item->c_block);
+ trace_ocfs2_purge_copied_metadata_tree(
+ (unsigned long long) item->c_block);
rb_erase(&item->c_node, root);
kmem_cache_free(ocfs2_uptodate_cachep, item);
@@ -112,19 +168,20 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
* This function is a few more lines longer than necessary due to some
* accounting done here, but I think it's worth tracking down those
* bugs sooner -- Mark */
-void ocfs2_metadata_cache_purge(struct inode *inode)
+void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
unsigned int tree, to_purge, purged;
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
struct rb_root root = RB_ROOT;
- spin_lock(&oi->ip_lock);
- tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+ BUG_ON(!ci || !ci->ci_ops);
+
+ ocfs2_metadata_cache_lock(ci);
+ tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
to_purge = ci->ci_num_cached;
- mlog(0, "Purge %u %s items from Inode %llu\n", to_purge,
- tree ? "array" : "tree", (unsigned long long)oi->ip_blkno);
+ trace_ocfs2_metadata_cache_purge(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ to_purge, tree);
/* If we're a tree, save off the root so that we can safely
* initialize the cache. We do the work to free tree members
@@ -132,16 +189,17 @@ void ocfs2_metadata_cache_purge(struct inode *inode)
if (tree)
root = ci->ci_cache.ci_tree;
- ocfs2_metadata_cache_init(inode);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_reset(ci, 0);
+ ocfs2_metadata_cache_unlock(ci);
purged = ocfs2_purge_copied_metadata_tree(&root);
/* If possible, track the number wiped so that we can more
* easily detect counting errors. Unfortunately, this is only
* meaningful for trees. */
if (tree && purged != to_purge)
- mlog(ML_ERROR, "Inode %llu, count = %u, purged = %u\n",
- (unsigned long long)oi->ip_blkno, to_purge, purged);
+ mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ to_purge, purged);
}
/* Returns the index in the cache array, -1 if not found.
@@ -182,39 +240,37 @@ ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
return NULL;
}
-static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
+static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
int index = -1;
struct ocfs2_meta_cache_item *item = NULL;
- spin_lock(&oi->ip_lock);
+ ocfs2_metadata_cache_lock(ci);
- mlog(0, "Inode %llu, query block %llu (inline = %u)\n",
- (unsigned long long)oi->ip_blkno,
- (unsigned long long) bh->b_blocknr,
- !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
+ trace_ocfs2_buffer_cached_begin(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long) bh->b_blocknr,
+ !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
- if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
- index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
- bh->b_blocknr);
+ if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
+ index = ocfs2_search_cache_array(ci, bh->b_blocknr);
else
- item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
- bh->b_blocknr);
+ item = ocfs2_search_cache_tree(ci, bh->b_blocknr);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
- mlog(0, "index = %d, item = %p\n", index, item);
+ trace_ocfs2_buffer_cached_end(index, item);
return (index != -1) || (item != NULL);
}
/* Warning: even if it returns true, this does *not* guarantee that
- * the block is stored in our inode metadata cache.
- *
+ * the block is stored in our inode metadata cache.
+ *
* This can be called under lock_buffer()
*/
-int ocfs2_buffer_uptodate(struct inode *inode,
+int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
/* Doesn't matter if the bh is in our cache or not -- if it's
@@ -230,27 +286,28 @@ int ocfs2_buffer_uptodate(struct inode *inode,
/* Ok, locally the buffer is marked as up to date, now search
* our cache to see if we can trust that. */
- return ocfs2_buffer_cached(OCFS2_I(inode), bh);
+ return ocfs2_buffer_cached(ci, bh);
}
-/*
+/*
* Determine whether a buffer is currently out on a read-ahead request.
- * ip_io_sem should be held to serialize submitters with the logic here.
+ * ci_io_sem should be held to serialize submitters with the logic here.
*/
-int ocfs2_buffer_read_ahead(struct inode *inode,
+int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
- return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh);
+ return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh);
}
/* Requires ip_lock */
static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
sector_t block)
{
- BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
+ BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
- mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
- ci->ci_num_cached);
+ trace_ocfs2_append_cache_array(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)block, ci->ci_num_cached);
ci->ci_cache.ci_array[ci->ci_num_cached] = block;
ci->ci_num_cached++;
@@ -267,8 +324,9 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
struct ocfs2_meta_cache_item *tmp;
- mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
- ci->ci_num_cached);
+ trace_ocfs2_insert_cache_tree(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)block, ci->ci_num_cached);
while(*p) {
parent = *p;
@@ -292,67 +350,65 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
ci->ci_num_cached++;
}
-static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
- struct ocfs2_caching_info *ci)
+/* co_cache_lock() must be held */
+static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci)
{
- assert_spin_locked(&oi->ip_lock);
-
- return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
- (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
+ return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) &&
+ (ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY);
}
-/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
+/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the
* pointers in tree after we use them - this allows caller to detect
- * when to free in case of error. */
-static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
+ * when to free in case of error.
+ *
+ * The co_cache_lock() must be held. */
+static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
struct ocfs2_meta_cache_item **tree)
{
int i;
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
- mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
- "Inode %llu, num cached = %u, should be %u\n",
- (unsigned long long)oi->ip_blkno, ci->ci_num_cached,
- OCFS2_INODE_MAX_CACHE_ARRAY);
- mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
- "Inode %llu not marked as inline anymore!\n",
- (unsigned long long)oi->ip_blkno);
- assert_spin_locked(&oi->ip_lock);
+ mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY,
+ "Owner %llu, num cached = %u, should be %u\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY);
+ mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE),
+ "Owner %llu not marked as inline anymore!\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci));
/* Be careful to initialize the tree members *first* because
* once the ci_tree is used, the array is junk... */
- for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+ for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
tree[i]->c_block = ci->ci_cache.ci_array[i];
- oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
+ ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE;
ci->ci_cache.ci_tree = RB_ROOT;
/* this will be set again by __ocfs2_insert_cache_tree */
ci->ci_num_cached = 0;
- for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+ for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
__ocfs2_insert_cache_tree(ci, tree[i]);
tree[i] = NULL;
}
- mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n",
- (unsigned long long)oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
+ trace_ocfs2_expand_cache(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ ci->ci_flags, ci->ci_num_cached);
}
/* Slow path function - memory allocation is necessary. See the
* comment above ocfs2_set_buffer_uptodate for more information. */
-static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
+static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
sector_t block,
int expand_tree)
{
int i;
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
struct ocfs2_meta_cache_item *new = NULL;
- struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
+ struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
{ NULL, };
- mlog(0, "Inode %llu, block %llu, expand = %d\n",
- (unsigned long long)oi->ip_blkno,
- (unsigned long long)block, expand_tree);
+ trace_ocfs2_set_buffer_uptodate(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)block, expand_tree);
new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
if (!new) {
@@ -364,7 +420,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
if (expand_tree) {
/* Do *not* allocate an array here - the removal code
* has no way of tracking that. */
- for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+ for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
GFP_NOFS);
if (!tree[i]) {
@@ -376,21 +432,20 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
}
}
- spin_lock(&oi->ip_lock);
- if (ocfs2_insert_can_use_array(oi, ci)) {
- mlog(0, "Someone cleared the tree underneath us\n");
+ ocfs2_metadata_cache_lock(ci);
+ if (ocfs2_insert_can_use_array(ci)) {
/* Ok, items were removed from the cache in between
* locks. Detect this and revert back to the fast path */
ocfs2_append_cache_array(ci, block);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
goto out_free;
}
if (expand_tree)
- ocfs2_expand_cache(oi, tree);
+ ocfs2_expand_cache(ci, tree);
__ocfs2_insert_cache_tree(ci, new);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
new = NULL;
out_free:
@@ -400,14 +455,14 @@ out_free:
/* If these were used, then ocfs2_expand_cache re-set them to
* NULL for us. */
if (tree[0]) {
- for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+ for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
if (tree[i])
kmem_cache_free(ocfs2_uptodate_cachep,
tree[i]);
}
}
-/* Item insertion is guarded by ip_io_mutex, so the insertion path takes
+/* Item insertion is guarded by co_io_lock(), so the insertion path takes
* advantage of this by not rechecking for a duplicate insert during
* the slow case. Additionally, if the cache needs to be bumped up to
* a tree, the code will not recheck after acquiring the lock --
@@ -425,59 +480,55 @@ out_free:
* Readahead buffers can be passed in here before the I/O request is
* completed.
*/
-void ocfs2_set_buffer_uptodate(struct inode *inode,
+void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
int expand;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
/* The block may very well exist in our cache already, so avoid
* doing any more work in that case. */
- if (ocfs2_buffer_cached(oi, bh))
+ if (ocfs2_buffer_cached(ci, bh))
return;
- mlog(0, "Inode %llu, inserting block %llu\n",
- (unsigned long long)oi->ip_blkno,
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_set_buffer_uptodate_begin(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)bh->b_blocknr);
/* No need to recheck under spinlock - insertion is guarded by
- * ip_io_mutex */
- spin_lock(&oi->ip_lock);
- if (ocfs2_insert_can_use_array(oi, ci)) {
+ * co_io_lock() */
+ ocfs2_metadata_cache_lock(ci);
+ if (ocfs2_insert_can_use_array(ci)) {
/* Fast case - it's an array and there's a free
* spot. */
ocfs2_append_cache_array(ci, bh->b_blocknr);
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
return;
}
expand = 0;
- if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+ if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
/* We need to bump things up to a tree. */
expand = 1;
}
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
- __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
+ __ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand);
}
/* Called against a newly allocated buffer. Most likely nobody should
* be able to read this sort of metadata while it's still being
- * allocated, but this is careful to take ip_io_mutex anyway. */
-void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+ * allocated, but this is careful to take co_io_lock() anyway. */
+void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
/* This should definitely *not* exist in our cache */
- BUG_ON(ocfs2_buffer_cached(oi, bh));
+ BUG_ON(ocfs2_buffer_cached(ci, bh));
set_buffer_uptodate(bh);
- mutex_lock(&oi->ip_io_mutex);
- ocfs2_set_buffer_uptodate(inode, bh);
- mutex_unlock(&oi->ip_io_mutex);
+ ocfs2_metadata_cache_io_lock(ci);
+ ocfs2_set_buffer_uptodate(ci, bh);
+ ocfs2_metadata_cache_io_unlock(ci);
}
/* Requires ip_lock. */
@@ -487,12 +538,13 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
sector_t *array = ci->ci_cache.ci_array;
int bytes;
- BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
+ BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY);
BUG_ON(index >= ci->ci_num_cached);
BUG_ON(!ci->ci_num_cached);
- mlog(0, "remove index %d (num_cached = %u\n", index,
- ci->ci_num_cached);
+ trace_ocfs2_remove_metadata_array(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ index, ci->ci_num_cached);
ci->ci_num_cached--;
@@ -508,28 +560,27 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
struct ocfs2_meta_cache_item *item)
{
- mlog(0, "remove block %llu from tree\n",
- (unsigned long long) item->c_block);
+ trace_ocfs2_remove_metadata_tree(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)item->c_block);
rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
ci->ci_num_cached--;
}
-static void ocfs2_remove_block_from_cache(struct inode *inode,
+static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
sector_t block)
{
int index;
struct ocfs2_meta_cache_item *item = NULL;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
- spin_lock(&oi->ip_lock);
- mlog(0, "Inode %llu, remove %llu, items = %u, array = %u\n",
- (unsigned long long)oi->ip_blkno,
- (unsigned long long) block, ci->ci_num_cached,
- oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+ ocfs2_metadata_cache_lock(ci);
+ trace_ocfs2_remove_block_from_cache(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long) block, ci->ci_num_cached,
+ ci->ci_flags);
- if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+ if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
index = ocfs2_search_cache_array(ci, block);
if (index != -1)
ocfs2_remove_metadata_array(ci, index);
@@ -538,7 +589,7 @@ static void ocfs2_remove_block_from_cache(struct inode *inode,
if (item)
ocfs2_remove_metadata_tree(ci, item);
}
- spin_unlock(&oi->ip_lock);
+ ocfs2_metadata_cache_unlock(ci);
if (item)
kmem_cache_free(ocfs2_uptodate_cachep, item);
@@ -549,23 +600,24 @@ static void ocfs2_remove_block_from_cache(struct inode *inode,
* bother reverting things to an inlined array in the case of a remove
* which moves us back under the limit.
*/
-void ocfs2_remove_from_cache(struct inode *inode,
+void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
struct buffer_head *bh)
{
sector_t block = bh->b_blocknr;
- ocfs2_remove_block_from_cache(inode, block);
+ ocfs2_remove_block_from_cache(ci, block);
}
/* Called when we remove xattr clusters from an inode. */
-void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
sector_t block,
u32 c_len)
{
- unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
+ struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len;
for (i = 0; i < b_len; i++, block++)
- ocfs2_remove_block_from_cache(inode, block);
+ ocfs2_remove_block_from_cache(ci, block);
}
int __init init_ocfs2_uptodate_cache(void)
@@ -576,9 +628,6 @@ int __init init_ocfs2_uptodate_cache(void)
if (!ocfs2_uptodate_cachep)
return -ENOMEM;
- mlog(0, "%u inlined cache items per inode.\n",
- OCFS2_INODE_MAX_CACHE_ARRAY);
-
return 0;
}
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 531b4b3a0c4..0d826fe2da0 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -26,24 +26,59 @@
#ifndef OCFS2_UPTODATE_H
#define OCFS2_UPTODATE_H
+/*
+ * Callbacks supplied by the user of struct ocfs2_caching_info. The
+ * caching code relies on them for locking and for owner information.
+ */
+struct ocfs2_caching_operations {
+ /*
+ * Return a u64 representing the owning structure. Usually this
+ * is the block number (i_blkno or whatnot). It lets caching
+ * log and trace messages identify the owning structure.
+ */
+ u64 (*co_owner)(struct ocfs2_caching_info *ci);
+
+ /* Return the superblock, which is needed during I/O. */
+ struct super_block *(*co_get_super)(struct ocfs2_caching_info *ci);
+ /*
+ * Lock and unlock the caching data. These will not sleep, and
+ * should probably be spinlocks.
+ */
+ void (*co_cache_lock)(struct ocfs2_caching_info *ci);
+ void (*co_cache_unlock)(struct ocfs2_caching_info *ci);
+
+ /*
+ * Lock and unlock for disk I/O; cache insertion is guarded by
+ * this lock too. These will sleep, and should be mutexes.
+ */
+ void (*co_io_lock)(struct ocfs2_caching_info *ci);
+ void (*co_io_unlock)(struct ocfs2_caching_info *ci);
+};
+
int __init init_ocfs2_uptodate_cache(void);
void exit_ocfs2_uptodate_cache(void);
-void ocfs2_metadata_cache_init(struct inode *inode);
-void ocfs2_metadata_cache_purge(struct inode *inode);
+void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
+ const struct ocfs2_caching_operations *ops);
+void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci);
+
+u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci);
-int ocfs2_buffer_uptodate(struct inode *inode,
+int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
-void ocfs2_set_buffer_uptodate(struct inode *inode,
+void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
-void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
-void ocfs2_remove_from_cache(struct inode *inode,
+void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
-void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
sector_t block,
u32 c_len);
-int ocfs2_buffer_read_ahead(struct inode *inode,
+int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
struct buffer_head *bh);
#endif /* OCFS2_UPTODATE_H */
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
deleted file mode 100644
index e2488f4128a..00000000000
--- a/fs/ocfs2/ver.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define OCFS2_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
-
-void ocfs2_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
deleted file mode 100644
index d7395cb91d2..00000000000
--- a/fs/ocfs2/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_VER_H
-#define OCFS2_VER_H
-
-void ocfs2_print_version(void);
-
-#endif /* OCFS2_VER_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 15631019dc6..016f01df382 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -37,7 +37,6 @@
#include <linux/string.h>
#include <linux/security.h>
-#define MLOG_MASK_PREFIX ML_XATTR
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -55,7 +54,9 @@
#include "buffer_head_io.h"
#include "super.h"
#include "xattr.h"
-
+#include "refcounttree.h"
+#include "acl.h"
+#include "ocfs2_trace.h"
struct ocfs2_xattr_def_value_root {
struct ocfs2_xattr_value_root xv;
@@ -78,6 +79,7 @@ struct ocfs2_xattr_set_ctxt {
struct ocfs2_alloc_context *meta_ac;
struct ocfs2_alloc_context *data_ac;
struct ocfs2_cached_dealloc_ctxt dealloc;
+ int set_abort;
};
#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
@@ -95,34 +97,31 @@ static struct ocfs2_xattr_def_value_root def_xv = {
.xv.xr_list.l_count = cpu_to_le16(1),
};
-struct xattr_handler *ocfs2_xattr_handlers[] = {
+const struct xattr_handler *ocfs2_xattr_handlers[] = {
&ocfs2_xattr_user_handler,
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
- &ocfs2_xattr_acl_access_handler,
- &ocfs2_xattr_acl_default_handler,
-#endif
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
&ocfs2_xattr_trusted_handler,
&ocfs2_xattr_security_handler,
NULL
};
-static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
+static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
[OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
- = &ocfs2_xattr_acl_access_handler,
+ = &posix_acl_access_xattr_handler,
[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
- = &ocfs2_xattr_acl_default_handler,
-#endif
+ = &posix_acl_default_xattr_handler,
[OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
[OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
};
struct ocfs2_xattr_info {
- int name_index;
- const char *name;
- const void *value;
- size_t value_len;
+ int xi_name_index;
+ const char *xi_name;
+ int xi_name_len;
+ const void *xi_value;
+ size_t xi_value_len;
};
struct ocfs2_xattr_search {
@@ -140,7 +139,116 @@ struct ocfs2_xattr_search {
int not_found;
};
-static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+/* Storage-specific operations on a struct ocfs2_xa_loc */
+struct ocfs2_xa_loc;
+struct ocfs2_xa_loc_operations {
+ /*
+ * Journal functions (type: presumably OCFS2_JOURNAL_ACCESS_*, confirm)
+ */
+ int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc,
+ int type);
+ void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc);
+
+ /*
+ * Return a pointer to the appropriate buffer in loc->xl_storage
+ * at the given offset from loc->xl_header.
+ */
+ void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset);
+
+ /* Can we reuse the existing entry for the new value in xi? */
+ int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi);
+
+ /* How much space is needed for the new value in xi? */
+ int (*xlo_check_space)(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi);
+
+ /*
+ * Return the offset of the first name+value pair. This is
+ * the start of our downward-filling free space.
+ */
+ int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc);
+
+ /*
+ * Remove the name+value at this location. Do whatever is
+ * appropriate with the remaining name+value pairs.
+ */
+ void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc);
+
+ /* Fill loc->xl_entry with a new entry, given its name hash */
+ void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash);
+
+ /* Add name+value storage (size: presumably bytes) to an entry */
+ void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size);
+
+ /*
+ * Initialize the value buf's access and bh fields for this entry.
+ * ocfs2_xa_fill_value_buf() will handle the xv pointer.
+ */
+ void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_value_buf *vb);
+};
+
+/*
+ * Describes an xattr entry location. This is an in-memory structure
+ * tracking the on-disk structure.
+ */
+struct ocfs2_xa_loc {
+ /* This xattr belongs to this inode */
+ struct inode *xl_inode;
+
+ /* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */
+ struct ocfs2_xattr_header *xl_header;
+
+ /* Bytes from xl_header to the end of the storage */
+ int xl_size;
+
+ /*
+ * The ocfs2_xattr_entry this location describes. If this is
+ * NULL, this location describes the on-disk structure where it
+ * would have been.
+ */
+ struct ocfs2_xattr_entry *xl_entry;
+
+ /*
+ * Internal housekeeping: untyped storage and the ops operating on it
+ */
+
+ /* Buffer(s) containing this entry; see xlo_offset_pointer() */
+ void *xl_storage;
+
+ /* Operations on the storage backing this location */
+ const struct ocfs2_xa_loc_operations *xl_ops;
+};
+
+/*
+ * Space needed for a name+value pair (_xi/_xe read it from a struct).
+ * Values over OCFS2_XATTR_INLINE_SIZE count as OCFS2_XATTR_ROOT_SIZE.
+ */
+static int namevalue_size(int name_len, uint64_t value_len)
+{
+ if (value_len > OCFS2_XATTR_INLINE_SIZE)
+ return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+ else
+ return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
+}
+
+static int namevalue_size_xi(struct ocfs2_xattr_info *xi)
+{
+ return namevalue_size(xi->xi_name_len, xi->xi_value_len);
+}
+
+static int namevalue_size_xe(struct ocfs2_xattr_entry *xe)
+{
+ u64 value_len = le64_to_cpu(xe->xe_value_size);
+
+ BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) &&
+ ocfs2_xattr_is_local(xe));
+ return namevalue_size(xe->xe_name_len, value_len);
+}
+
+
+static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
struct ocfs2_xattr_header *xh,
int index,
int *block_off,
@@ -157,7 +265,7 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
struct ocfs2_xattr_search *xs);
static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
- struct ocfs2_xattr_tree_root *xt,
+ struct buffer_head *blk_bh,
char *buffer,
size_t buffer_size);
@@ -170,12 +278,40 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
struct ocfs2_xattr_search *xs,
struct ocfs2_xattr_set_ctxt *ctxt);
-static int ocfs2_delete_xattr_index_block(struct inode *inode,
- struct buffer_head *xb_bh);
+typedef int (xattr_tree_rec_func)(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno, u32 cpos, u32 len, void *para);
+static int ocfs2_iterate_xattr_index_block(struct inode *inode,
+ struct buffer_head *root_bh,
+ xattr_tree_rec_func *rec_func,
+ void *para);
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para);
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno,
+ u32 cpos,
+ u32 len,
+ void *para);
+
static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
u64 src_blk, u64 last_blk, u64 to_blk,
unsigned int start_bucket,
u32 *first_hash);
+static int ocfs2_prepare_refcount_xattr(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xis,
+ struct ocfs2_xattr_search *xbs,
+ struct ocfs2_refcount_tree **ref_tree,
+ int *meta_need,
+ int *credits);
+static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
+ struct ocfs2_xattr_bucket *bucket,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **bh);
static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
{
@@ -187,14 +323,6 @@ static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
}
-static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
-{
- u16 len = sb->s_blocksize -
- offsetof(struct ocfs2_xattr_header, xh_entries);
-
- return len / sizeof(struct ocfs2_xattr_entry);
-}
-
#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
@@ -241,7 +369,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
* them fully.
*/
static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
- u64 xb_blkno)
+ u64 xb_blkno, int new)
{
int i, rc = 0;
@@ -249,15 +377,22 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
xb_blkno + i);
if (!bucket->bu_bhs[i]) {
- rc = -EIO;
+ rc = -ENOMEM;
mlog_errno(rc);
break;
}
- if (!ocfs2_buffer_uptodate(bucket->bu_inode,
- bucket->bu_bhs[i]))
- ocfs2_set_new_buffer_uptodate(bucket->bu_inode,
- bucket->bu_bhs[i]);
+ if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+ bucket->bu_bhs[i])) {
+ if (new)
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+ bucket->bu_bhs[i]);
+ else {
+ set_buffer_uptodate(bucket->bu_bhs[i]);
+ ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+ bucket->bu_bhs[i]);
+ }
+ }
}
if (rc)
@@ -271,7 +406,7 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
{
int rc;
- rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
+ rc = ocfs2_read_blocks(INODE_CACHE(bucket->bu_inode), xb_blkno,
bucket->bu_blocks, bucket->bu_bhs, 0,
NULL);
if (!rc) {
@@ -297,7 +432,8 @@ static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
int i, rc = 0;
for (i = 0; i < bucket->bu_blocks; i++) {
- rc = ocfs2_journal_access(handle, bucket->bu_inode,
+ rc = ocfs2_journal_access(handle,
+ INODE_CACHE(bucket->bu_inode),
bucket->bu_bhs[i], type);
if (rc) {
mlog_errno(rc);
@@ -345,8 +481,7 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
struct ocfs2_xattr_block *xb =
(struct ocfs2_xattr_block *)bh->b_data;
- mlog(0, "Validating xattr block %llu\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_validate_xattr_block((unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
@@ -399,7 +534,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
int rc;
struct buffer_head *tmp = *bh;
- rc = ocfs2_read_block(inode, xb_blkno, &tmp,
+ rc = ocfs2_read_block(INODE_CACHE(inode), xb_blkno, &tmp,
ocfs2_validate_xattr_block);
/* If ocfs2_read_block() got us a new bh, pass it up. */
@@ -411,7 +546,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
static inline const char *ocfs2_xattr_prefix(int name_index)
{
- struct xattr_handler *handler = NULL;
+ const struct xattr_handler *handler = NULL;
if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
handler = ocfs2_xattr_handler_map[name_index];
@@ -437,35 +572,22 @@ static u32 ocfs2_xattr_name_hash(struct inode *inode,
return hash;
}
-/*
- * ocfs2_xattr_hash_entry()
- *
- * Compute the hash of an extended attribute.
- */
-static void ocfs2_xattr_hash_entry(struct inode *inode,
- struct ocfs2_xattr_header *header,
- struct ocfs2_xattr_entry *entry)
+static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
{
- u32 hash = 0;
- char *name = (char *)header + le16_to_cpu(entry->xe_name_offset);
-
- hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
- entry->xe_name_hash = cpu_to_le32(hash);
-
- return;
+ return namevalue_size(name_len, value_len) +
+ sizeof(struct ocfs2_xattr_entry);
}
-static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
+static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi)
{
- int size = 0;
-
- if (value_len <= OCFS2_XATTR_INLINE_SIZE)
- size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
- else
- size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
- size += sizeof(struct ocfs2_xattr_entry);
+ return namevalue_size_xi(xi) +
+ sizeof(struct ocfs2_xattr_entry);
+}
- return size;
+static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe)
+{
+ return namevalue_size_xe(xe) +
+ sizeof(struct ocfs2_xattr_entry);
}
int ocfs2_calc_security_init(struct inode *dir,
@@ -508,7 +630,7 @@ int ocfs2_calc_security_init(struct inode *dir,
int ocfs2_calc_xattr_init(struct inode *dir,
struct buffer_head *dir_bh,
- int mode,
+ umode_t mode,
struct ocfs2_security_xattr_info *si,
int *want_clusters,
int *xattr_credits,
@@ -593,55 +715,61 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
struct ocfs2_xattr_value_buf *vb,
struct ocfs2_xattr_set_ctxt *ctxt)
{
- int status = 0;
+ int status = 0, credits;
handle_t *handle = ctxt->handle;
enum ocfs2_alloc_restarted why;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
struct ocfs2_extent_tree et;
- mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
-
- ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
+ ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
- status = vb->vb_access(handle, inode, vb->vb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ while (clusters_to_add) {
+ trace_ocfs2_xattr_extend_allocation(clusters_to_add);
- prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
- status = ocfs2_add_clusters_in_btree(osb,
- inode,
- &logical_start,
- clusters_to_add,
- 0,
- &et,
- handle,
- ctxt->data_ac,
- ctxt->meta_ac,
- &why);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ break;
+ }
- status = ocfs2_journal_dirty(handle, vb->vb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+ status = ocfs2_add_clusters_in_btree(handle,
+ &et,
+ &logical_start,
+ clusters_to_add,
+ 0,
+ ctxt->data_ac,
+ ctxt->meta_ac,
+ &why);
+ if ((status < 0) && (status != -EAGAIN)) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ break;
+ }
- clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
+ ocfs2_journal_dirty(handle, vb->vb_bh);
- /*
- * We should have already allocated enough space before the transaction,
- * so no need to restart.
- */
- BUG_ON(why != RESTART_NONE || clusters_to_add);
+ clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) -
+ prev_clusters;
-leave:
+ if (why != RESTART_NONE && clusters_to_add) {
+ /*
+ * We can only fail in case the alloc file doesn't give
+ * up enough clusters.
+ */
+ BUG_ON(why == RESTART_META);
+
+ credits = ocfs2_calc_extend_credits(inode->i_sb,
+ &vb->vb_xv->xr_list);
+ status = ocfs2_extend_trans(handle, credits);
+ if (status < 0) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ break;
+ }
+ }
+ }
return status;
}
@@ -649,6 +777,7 @@ leave:
static int __ocfs2_remove_xattr_range(struct inode *inode,
struct ocfs2_xattr_value_buf *vb,
u32 cpos, u32 phys_cpos, u32 len,
+ unsigned int ext_flags,
struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret;
@@ -656,16 +785,16 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
handle_t *handle = ctxt->handle;
struct ocfs2_extent_tree et;
- ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
+ ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
- ret = vb->vb_access(handle, inode, vb->vb_bh,
+ ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac,
+ ret = ocfs2_remove_extent(handle, &et, cpos, len, ctxt->meta_ac,
&ctxt->dealloc);
if (ret) {
mlog_errno(ret);
@@ -673,14 +802,16 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
}
le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
+ ocfs2_journal_dirty(handle, vb->vb_bh);
- ret = ocfs2_journal_dirty(handle, vb->vb_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ ret = ocfs2_decrease_refcount(inode, handle,
+ ocfs2_blocks_to_clusters(inode->i_sb,
+ phys_blkno),
+ len, ctxt->meta_ac, &ctxt->dealloc, 1);
+ else
+ ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc,
+ phys_blkno, len);
if (ret)
mlog_errno(ret);
@@ -695,6 +826,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret = 0;
+ unsigned int ext_flags;
u32 trunc_len, cpos, phys_cpos, alloc_size;
u64 block;
@@ -706,7 +838,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
while (trunc_len) {
ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
&alloc_size,
- &vb->vb_xv->xr_list);
+ &vb->vb_xv->xr_list, &ext_flags);
if (ret) {
mlog_errno(ret);
goto out;
@@ -717,15 +849,15 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
phys_cpos, alloc_size,
- ctxt);
+ ext_flags, ctxt);
if (ret) {
mlog_errno(ret);
goto out;
}
block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
- ocfs2_remove_xattr_clusters_from_cache(inode, block,
- alloc_size);
+ ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode),
+ block, alloc_size);
cpos += alloc_size;
trunc_len -= alloc_size;
}
@@ -810,6 +942,23 @@ static int ocfs2_xattr_list_entries(struct inode *inode,
return result;
}
+int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
+ struct ocfs2_dinode *di)
+{
+ struct ocfs2_xattr_header *xh;
+ int i;
+
+ xh = (struct ocfs2_xattr_header *)
+ ((void *)di + inode->i_sb->s_blocksize -
+ le16_to_cpu(di->i_xattr_inline_size));
+
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++)
+ if (!ocfs2_xattr_is_local(&xh->xh_entries[i]))
+ return 1;
+
+ return 0;
+}
+
static int ocfs2_xattr_ibody_list(struct inode *inode,
struct ocfs2_dinode *di,
char *buffer,
@@ -855,11 +1004,9 @@ static int ocfs2_xattr_block_list(struct inode *inode,
struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
ret = ocfs2_xattr_list_entries(inode, header,
buffer, buffer_size);
- } else {
- struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
- ret = ocfs2_xattr_tree_list_index_block(inode, xt,
+ } else
+ ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh,
buffer, buffer_size);
- }
brelse(blk_bh);
@@ -961,7 +1108,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
cpos = 0;
while (cpos < clusters) {
ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
- &num_clusters, el);
+ &num_clusters, el, NULL);
if (ret) {
mlog_errno(ret);
goto out;
@@ -970,7 +1117,8 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
/* Copy ocfs2_xattr_value */
for (i = 0; i < num_clusters * bpc; i++, blkno++) {
- ret = ocfs2_read_block(inode, blkno, &bh, NULL);
+ ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
+ &bh, NULL);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1052,7 +1200,8 @@ static int ocfs2_xattr_block_get(struct inode *inode,
struct ocfs2_xattr_block *xb;
struct ocfs2_xattr_value_root *xv;
size_t size;
- int ret = -ENODATA, name_offset, name_len, block_off, i;
+ int ret = -ENODATA, name_offset, name_len, i;
+ int uninitialized_var(block_off);
xs->bucket = ocfs2_xattr_bucket_new(inode);
if (!xs->bucket) {
@@ -1084,7 +1233,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
i = xs->here - xs->header->xh_entries;
if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
- ret = ocfs2_xattr_bucket_get_name_value(inode,
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
bucket_xh(xs->bucket),
i,
&block_off,
@@ -1140,13 +1289,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
xis.inode_bh = xbs.inode_bh = di_bh;
di = (struct ocfs2_dinode *)di_bh->b_data;
- down_read(&oi->ip_xattr_sem);
ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
buffer_size, &xis);
if (ret == -ENODATA && di->i_xattr_loc)
ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
buffer_size, &xbs);
- up_read(&oi->ip_xattr_sem);
return ret;
}
@@ -1170,8 +1317,10 @@ static int ocfs2_xattr_get(struct inode *inode,
mlog_errno(ret);
return ret;
}
+ down_read(&OCFS2_I(inode)->ip_xattr_sem);
ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
name, buffer, buffer_size);
+ up_read(&OCFS2_I(inode)->ip_xattr_sem);
ocfs2_inode_unlock(inode, 0);
@@ -1182,7 +1331,7 @@ static int ocfs2_xattr_get(struct inode *inode,
static int __ocfs2_xattr_set_value_outside(struct inode *inode,
handle_t *handle,
- struct ocfs2_xattr_value_root *xv,
+ struct ocfs2_xattr_value_buf *vb,
const void *value,
int value_len)
{
@@ -1193,28 +1342,34 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
u64 blkno;
struct buffer_head *bh = NULL;
+ unsigned int ext_flags;
+ struct ocfs2_xattr_value_root *xv = vb->vb_xv;
BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
while (cpos < clusters) {
ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
- &num_clusters, &xv->xr_list);
+ &num_clusters, &xv->xr_list,
+ &ext_flags);
if (ret) {
mlog_errno(ret);
goto out;
}
+ BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+
blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
for (i = 0; i < num_clusters * bpc; i++, blkno++) {
- ret = ocfs2_read_block(inode, blkno, &bh, NULL);
+ ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
+ &bh, NULL);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_journal_access(handle,
- inode,
+ INODE_CACHE(inode),
bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
@@ -1230,11 +1385,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
memset(bh->b_data + cp_len, 0,
blocksize - cp_len);
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
+ ocfs2_journal_dirty(handle, bh);
brelse(bh);
bh = NULL;
@@ -1253,497 +1404,1009 @@ out:
return ret;
}
-static int ocfs2_xattr_cleanup(struct inode *inode,
- handle_t *handle,
- struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs,
- struct ocfs2_xattr_value_buf *vb,
- size_t offs)
+static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
+ int num_entries)
{
- int ret = 0;
- size_t name_len = strlen(xi->name);
- void *val = xs->base + offs;
- size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+ int free_space;
- ret = vb->vb_access(handle, inode, vb->vb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
- /* Decrease xattr count */
- le16_add_cpu(&xs->header->xh_count, -1);
- /* Remove the xattr entry and tree root which has already be set*/
- memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
- memset(val, 0, size);
+ if (!needed_space)
+ return 0;
- ret = ocfs2_journal_dirty(handle, vb->vb_bh);
- if (ret < 0)
- mlog_errno(ret);
-out:
- return ret;
+ free_space = free_start -
+ sizeof(struct ocfs2_xattr_header) -
+ (num_entries * sizeof(struct ocfs2_xattr_entry)) -
+ OCFS2_XATTR_HEADER_GAP;
+ if (free_space < 0)
+ return -EIO;
+ if (free_space < needed_space)
+ return -ENOSPC;
+
+ return 0;
}
-static int ocfs2_xattr_update_entry(struct inode *inode,
- handle_t *handle,
- struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs,
- struct ocfs2_xattr_value_buf *vb,
- size_t offs)
+static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc,
+ int type)
{
- int ret;
+ return loc->xl_ops->xlo_journal_access(handle, loc, type);
+}
- ret = vb->vb_access(handle, inode, vb->vb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc)
+{
+ loc->xl_ops->xlo_journal_dirty(handle, loc);
+}
- xs->here->xe_name_offset = cpu_to_le16(offs);
- xs->here->xe_value_size = cpu_to_le64(xi->value_len);
- if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE)
- ocfs2_xattr_set_local(xs->here, 1);
- else
- ocfs2_xattr_set_local(xs->here, 0);
- ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+/* Give a pointer into the storage for the given offset */
+static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset)
+{
+ BUG_ON(offset >= loc->xl_size);
+ return loc->xl_ops->xlo_offset_pointer(loc, offset);
+}
- ret = ocfs2_journal_dirty(handle, vb->vb_bh);
- if (ret < 0)
- mlog_errno(ret);
-out:
- return ret;
+/*
+ * Wipe the name+value pair and allow the storage to reclaim it. This
+ * must be followed by either removal of the entry or a call to
+ * ocfs2_xa_add_namevalue().
+ */
+static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+ loc->xl_ops->xlo_wipe_namevalue(loc);
}
/*
- * ocfs2_xattr_set_value_outside()
- *
- * Set large size value in B tree.
+ * Find lowest offset to a name+value pair. This is the start of our
+ * downward-growing free space.
*/
-static int ocfs2_xattr_set_value_outside(struct inode *inode,
- struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs,
- struct ocfs2_xattr_set_ctxt *ctxt,
- struct ocfs2_xattr_value_buf *vb,
- size_t offs)
+static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc)
{
- size_t name_len = strlen(xi->name);
- void *val = xs->base + offs;
- struct ocfs2_xattr_value_root *xv = NULL;
- size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
- int ret = 0;
+ return loc->xl_ops->xlo_get_free_start(loc);
+}
- memset(val, 0, size);
- memcpy(val, xi->name, name_len);
- xv = (struct ocfs2_xattr_value_root *)
- (val + OCFS2_XATTR_SIZE(name_len));
- xv->xr_clusters = 0;
- xv->xr_last_eb_blk = 0;
- xv->xr_list.l_tree_depth = 0;
- xv->xr_list.l_count = cpu_to_le16(1);
- xv->xr_list.l_next_free_rec = 0;
- vb->vb_xv = xv;
-
- ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
+/* Can we reuse loc->xl_entry for xi? */
+static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ return loc->xl_ops->xlo_can_reuse(loc, xi);
+}
+
+/* How much free space is needed to set the new value */
+static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ return loc->xl_ops->xlo_check_space(loc, xi);
+}
+
+static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+ loc->xl_ops->xlo_add_entry(loc, name_hash);
+ loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
+ /*
+ * We can't leave the new entry's xe_name_offset at zero or
+ * add_namevalue() will go nuts. We set it to the size of our
+ * storage so that it can never be less than any other entry.
+ */
+ loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
+}
+
+static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ int size = namevalue_size_xi(xi);
+ int nameval_offset;
+ char *nameval_buf;
+
+ loc->xl_ops->xlo_add_namevalue(loc, size);
+ loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
+ loc->xl_entry->xe_name_len = xi->xi_name_len;
+ ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index);
+ ocfs2_xattr_set_local(loc->xl_entry,
+ xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE);
+
+ nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+ nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
+ memset(nameval_buf, 0, size);
+ memcpy(nameval_buf, xi->xi_name, xi->xi_name_len);
+}
+
+static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_value_buf *vb)
+{
+ int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+ int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
+
+ /* Value bufs are for value trees */
+ BUG_ON(ocfs2_xattr_is_local(loc->xl_entry));
+ BUG_ON(namevalue_size_xe(loc->xl_entry) !=
+ (name_size + OCFS2_XATTR_ROOT_SIZE));
+
+ loc->xl_ops->xlo_fill_value_buf(loc, vb);
+ vb->vb_xv =
+ (struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc,
+ nameval_offset +
+ name_size);
+}
+
+static int ocfs2_xa_block_journal_access(handle_t *handle,
+ struct ocfs2_xa_loc *loc, int type)
+{
+ struct buffer_head *bh = loc->xl_storage;
+ ocfs2_journal_access_func access;
+
+ if (loc->xl_size == (bh->b_size -
+ offsetof(struct ocfs2_xattr_block,
+ xb_attrs.xb_header)))
+ access = ocfs2_journal_access_xb;
+ else
+ access = ocfs2_journal_access_di;
+ return access(handle, INODE_CACHE(loc->xl_inode), bh, type);
+}
+
+static void ocfs2_xa_block_journal_dirty(handle_t *handle,
+ struct ocfs2_xa_loc *loc)
+{
+ struct buffer_head *bh = loc->xl_storage;
+
+ ocfs2_journal_dirty(handle, bh);
+}
+
+static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
+ int offset)
+{
+ return (char *)loc->xl_header + offset;
+}
+
+static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ /*
+ * Block storage is strict. If the sizes aren't exact, we will
+ * remove the old one and reinsert the new.
+ */
+ return namevalue_size_xe(loc->xl_entry) ==
+ namevalue_size_xi(xi);
+}
+
+static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc)
+{
+ struct ocfs2_xattr_header *xh = loc->xl_header;
+ int i, count = le16_to_cpu(xh->xh_count);
+ int offset, free_start = loc->xl_size;
+
+ for (i = 0; i < count; i++) {
+ offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
+ if (offset < free_start)
+ free_start = offset;
}
- ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
+
+ return free_start;
+}
+
+static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ int count = le16_to_cpu(loc->xl_header->xh_count);
+ int free_start = ocfs2_xa_get_free_start(loc);
+ int needed_space = ocfs2_xi_entry_usage(xi);
+
+ /*
+ * Block storage will reclaim the original entry before inserting
+ * the new value, so we only need the difference. If the new
+ * entry is smaller than the old one, we don't need anything.
+ */
+ if (loc->xl_entry) {
+ /* Don't need space if we're reusing! */
+ if (ocfs2_xa_can_reuse_entry(loc, xi))
+ needed_space = 0;
+ else
+ needed_space -= ocfs2_xe_entry_usage(loc->xl_entry);
}
- ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
- xi->value, xi->value_len);
- if (ret < 0)
- mlog_errno(ret);
+ if (needed_space < 0)
+ needed_space = 0;
+ return ocfs2_xa_check_space_helper(needed_space, free_start, count);
+}
- return ret;
+/*
+ * Block storage for xattrs keeps the name+value pairs compacted. When
+ * we remove one, we have to shift any that preceded it towards the end.
+ */
+static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+ int i, offset;
+ int namevalue_offset, first_namevalue_offset, namevalue_size;
+ struct ocfs2_xattr_entry *entry = loc->xl_entry;
+ struct ocfs2_xattr_header *xh = loc->xl_header;
+ int count = le16_to_cpu(xh->xh_count);
+
+ namevalue_offset = le16_to_cpu(entry->xe_name_offset);
+ namevalue_size = namevalue_size_xe(entry);
+ first_namevalue_offset = ocfs2_xa_get_free_start(loc);
+
+ /* Shift the name+value pairs */
+ memmove((char *)xh + first_namevalue_offset + namevalue_size,
+ (char *)xh + first_namevalue_offset,
+ namevalue_offset - first_namevalue_offset);
+ memset((char *)xh + first_namevalue_offset, 0, namevalue_size);
+
+ /* Now tell xh->xh_entries about it */
+ for (i = 0; i < count; i++) {
+ offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
+ if (offset <= namevalue_offset)
+ le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
+ namevalue_size);
+ }
+
+ /*
+ * Note that we don't update xh_free_start or xh_name_value_len
+ * because they're not used in block-stored xattrs.
+ */
+}
+
+static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+ int count = le16_to_cpu(loc->xl_header->xh_count);
+ loc->xl_entry = &(loc->xl_header->xh_entries[count]);
+ le16_add_cpu(&loc->xl_header->xh_count, 1);
+ memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
+}
+
+static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size)
+{
+ int free_start = ocfs2_xa_get_free_start(loc);
+
+ loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size);
+}
+
+static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_value_buf *vb)
+{
+ struct buffer_head *bh = loc->xl_storage;
+
+ if (loc->xl_size == (bh->b_size -
+ offsetof(struct ocfs2_xattr_block,
+ xb_attrs.xb_header)))
+ vb->vb_access = ocfs2_journal_access_xb;
+ else
+ vb->vb_access = ocfs2_journal_access_di;
+ vb->vb_bh = bh;
}
/*
- * ocfs2_xattr_set_entry_local()
- *
- * Set, replace or remove extended attribute in local.
+ * Operations for xattrs stored in blocks. This includes inline inode
+ * storage and unindexed ocfs2_xattr_blocks.
*/
-static void ocfs2_xattr_set_entry_local(struct inode *inode,
- struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs,
- struct ocfs2_xattr_entry *last,
- size_t min_offs)
+static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
+ .xlo_journal_access = ocfs2_xa_block_journal_access,
+ .xlo_journal_dirty = ocfs2_xa_block_journal_dirty,
+ .xlo_offset_pointer = ocfs2_xa_block_offset_pointer,
+ .xlo_check_space = ocfs2_xa_block_check_space,
+ .xlo_can_reuse = ocfs2_xa_block_can_reuse,
+ .xlo_get_free_start = ocfs2_xa_block_get_free_start,
+ .xlo_wipe_namevalue = ocfs2_xa_block_wipe_namevalue,
+ .xlo_add_entry = ocfs2_xa_block_add_entry,
+ .xlo_add_namevalue = ocfs2_xa_block_add_namevalue,
+ .xlo_fill_value_buf = ocfs2_xa_block_fill_value_buf,
+};
+
+static int ocfs2_xa_bucket_journal_access(handle_t *handle,
+ struct ocfs2_xa_loc *loc, int type)
{
- size_t name_len = strlen(xi->name);
- int i;
+ struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
- if (xi->value && xs->not_found) {
- /* Insert the new xattr entry. */
- le16_add_cpu(&xs->header->xh_count, 1);
- ocfs2_xattr_set_type(last, xi->name_index);
- ocfs2_xattr_set_local(last, 1);
- last->xe_name_len = name_len;
- } else {
- void *first_val;
- void *val;
- size_t offs, size;
-
- first_val = xs->base + min_offs;
- offs = le16_to_cpu(xs->here->xe_name_offset);
- val = xs->base + offs;
-
- if (le64_to_cpu(xs->here->xe_value_size) >
- OCFS2_XATTR_INLINE_SIZE)
- size = OCFS2_XATTR_SIZE(name_len) +
- OCFS2_XATTR_ROOT_SIZE;
+ return ocfs2_xattr_bucket_journal_access(handle, bucket, type);
+}
+
+static void ocfs2_xa_bucket_journal_dirty(handle_t *handle,
+ struct ocfs2_xa_loc *loc)
+{
+ struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+
+ ocfs2_xattr_bucket_journal_dirty(handle, bucket);
+}
+
+static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
+ int offset)
+{
+ struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+ int block, block_offset;
+
+ /* The header is at the front of the bucket */
+ block = offset >> loc->xl_inode->i_sb->s_blocksize_bits;
+ block_offset = offset % loc->xl_inode->i_sb->s_blocksize;
+
+ return bucket_block(bucket, block) + block_offset;
+}
+
+static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ return namevalue_size_xe(loc->xl_entry) >=
+ namevalue_size_xi(xi);
+}
+
+static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc)
+{
+ struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+ return le16_to_cpu(bucket_xh(bucket)->xh_free_start);
+}
+
+static int ocfs2_bucket_align_free_start(struct super_block *sb,
+ int free_start, int size)
+{
+ /*
+ * We need to make sure that the name+value pair fits within
+ * one block.
+ */
+ if (((free_start - size) >> sb->s_blocksize_bits) !=
+ ((free_start - 1) >> sb->s_blocksize_bits))
+ free_start -= free_start % sb->s_blocksize;
+
+ return free_start;
+}
+
+static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi)
+{
+ int rc;
+ int count = le16_to_cpu(loc->xl_header->xh_count);
+ int free_start = ocfs2_xa_get_free_start(loc);
+ int needed_space = ocfs2_xi_entry_usage(xi);
+ int size = namevalue_size_xi(xi);
+ struct super_block *sb = loc->xl_inode->i_sb;
+
+ /*
+ * Bucket storage does not reclaim name+value pairs it cannot
+ * reuse. They live as holes until the bucket fills, and then
+ * the bucket is defragmented. However, the bucket can reclaim
+ * the ocfs2_xattr_entry.
+ */
+ if (loc->xl_entry) {
+ /* Don't need space if we're reusing! */
+ if (ocfs2_xa_can_reuse_entry(loc, xi))
+ needed_space = 0;
else
- size = OCFS2_XATTR_SIZE(name_len) +
- OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
-
- if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
- OCFS2_XATTR_SIZE(xi->value_len)) {
- /* The old and the new value have the
- same size. Just replace the value. */
- ocfs2_xattr_set_local(xs->here, 1);
- xs->here->xe_value_size = cpu_to_le64(xi->value_len);
- /* Clear value bytes. */
- memset(val + OCFS2_XATTR_SIZE(name_len),
- 0,
- OCFS2_XATTR_SIZE(xi->value_len));
- memcpy(val + OCFS2_XATTR_SIZE(name_len),
- xi->value,
- xi->value_len);
- return;
- }
- /* Remove the old name+value. */
- memmove(first_val + size, first_val, val - first_val);
- memset(first_val, 0, size);
- xs->here->xe_name_hash = 0;
- xs->here->xe_name_offset = 0;
- ocfs2_xattr_set_local(xs->here, 1);
- xs->here->xe_value_size = 0;
-
- min_offs += size;
-
- /* Adjust all value offsets. */
- last = xs->header->xh_entries;
- for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
- size_t o = le16_to_cpu(last->xe_name_offset);
-
- if (o < offs)
- last->xe_name_offset = cpu_to_le16(o + size);
- last += 1;
- }
+ needed_space -= sizeof(struct ocfs2_xattr_entry);
+ }
+ BUG_ON(needed_space < 0);
- if (!xi->value) {
- /* Remove the old entry. */
- last -= 1;
- memmove(xs->here, xs->here + 1,
- (void *)last - (void *)xs->here);
- memset(last, 0, sizeof(struct ocfs2_xattr_entry));
- le16_add_cpu(&xs->header->xh_count, -1);
- }
+ if (free_start < size) {
+ if (needed_space)
+ return -ENOSPC;
+ } else {
+ /*
+ * First we check if it would fit in the first place.
+ * Below, we align the free start to a block. This may
+ * slide us below the minimum gap. By checking unaligned
+ * first, we avoid that error.
+ */
+ rc = ocfs2_xa_check_space_helper(needed_space, free_start,
+ count);
+ if (rc)
+ return rc;
+ free_start = ocfs2_bucket_align_free_start(sb, free_start,
+ size);
}
- if (xi->value) {
- /* Insert the new name+value. */
- size_t size = OCFS2_XATTR_SIZE(name_len) +
- OCFS2_XATTR_SIZE(xi->value_len);
- void *val = xs->base + min_offs - size;
+ return ocfs2_xa_check_space_helper(needed_space, free_start, count);
+}
- xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
- memset(val, 0, size);
- memcpy(val, xi->name, name_len);
- memcpy(val + OCFS2_XATTR_SIZE(name_len),
- xi->value,
- xi->value_len);
- xs->here->xe_value_size = cpu_to_le64(xi->value_len);
- ocfs2_xattr_set_local(xs->here, 1);
- ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+ le16_add_cpu(&loc->xl_header->xh_name_value_len,
+ -namevalue_size_xe(loc->xl_entry));
+}
+
+static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+ struct ocfs2_xattr_header *xh = loc->xl_header;
+ int count = le16_to_cpu(xh->xh_count);
+ int low = 0, high = count - 1, tmp;
+ struct ocfs2_xattr_entry *tmp_xe;
+
+ /*
+ * We keep buckets sorted by name_hash, so we need to find
+ * our insert place.
+ */
+ while (low <= high && count) {
+ tmp = (low + high) / 2;
+ tmp_xe = &xh->xh_entries[tmp];
+
+ if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
+ low = tmp + 1;
+ else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash))
+ high = tmp - 1;
+ else {
+ low = tmp;
+ break;
+ }
}
- return;
+ if (low != count)
+ memmove(&xh->xh_entries[low + 1],
+ &xh->xh_entries[low],
+ ((count - low) * sizeof(struct ocfs2_xattr_entry)));
+
+ le16_add_cpu(&xh->xh_count, 1);
+ loc->xl_entry = &xh->xh_entries[low];
+ memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
+}
+
+static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
+{
+ int free_start = ocfs2_xa_get_free_start(loc);
+ struct ocfs2_xattr_header *xh = loc->xl_header;
+ struct super_block *sb = loc->xl_inode->i_sb;
+ int nameval_offset;
+
+ free_start = ocfs2_bucket_align_free_start(sb, free_start, size);
+ nameval_offset = free_start - size;
+ loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset);
+ xh->xh_free_start = cpu_to_le16(nameval_offset);
+ le16_add_cpu(&xh->xh_name_value_len, size);
+
+}
+
+static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_value_buf *vb)
+{
+ struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+ struct super_block *sb = loc->xl_inode->i_sb;
+ int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+ int size = namevalue_size_xe(loc->xl_entry);
+ int block_offset = nameval_offset >> sb->s_blocksize_bits;
+
+ /* Values are not allowed to straddle block boundaries */
+ BUG_ON(block_offset !=
+ ((nameval_offset + size - 1) >> sb->s_blocksize_bits));
+ /* We expect the bucket to be filled in */
+ BUG_ON(!bucket->bu_bhs[block_offset]);
+
+ vb->vb_access = ocfs2_journal_access;
+ vb->vb_bh = bucket->bu_bhs[block_offset];
+}
+
+/* Operations for xattrs stored in buckets. */
+static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
+ .xlo_journal_access = ocfs2_xa_bucket_journal_access,
+ .xlo_journal_dirty = ocfs2_xa_bucket_journal_dirty,
+ .xlo_offset_pointer = ocfs2_xa_bucket_offset_pointer,
+ .xlo_check_space = ocfs2_xa_bucket_check_space,
+ .xlo_can_reuse = ocfs2_xa_bucket_can_reuse,
+ .xlo_get_free_start = ocfs2_xa_bucket_get_free_start,
+ .xlo_wipe_namevalue = ocfs2_xa_bucket_wipe_namevalue,
+ .xlo_add_entry = ocfs2_xa_bucket_add_entry,
+ .xlo_add_namevalue = ocfs2_xa_bucket_add_namevalue,
+ .xlo_fill_value_buf = ocfs2_xa_bucket_fill_value_buf,
+};
+
+static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc)
+{
+ struct ocfs2_xattr_value_buf vb;
+
+ if (ocfs2_xattr_is_local(loc->xl_entry))
+ return 0;
+
+ ocfs2_xa_fill_value_buf(loc, &vb);
+ return le32_to_cpu(vb.vb_xv->xr_clusters);
+}
+
+static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int trunc_rc, access_rc;
+ struct ocfs2_xattr_value_buf vb;
+
+ ocfs2_xa_fill_value_buf(loc, &vb);
+ trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes,
+ ctxt);
+
+ /*
+ * The caller of ocfs2_xa_value_truncate() has already called
+ * ocfs2_xa_journal_access on the loc. However, The truncate code
+ * calls ocfs2_extend_trans(). This may commit the previous
+ * transaction and open a new one. If this is a bucket, truncate
+ * could leave only vb->vb_bh set up for journaling. Meanwhile,
+ * the caller is expecting to dirty the entire bucket. So we must
+ * reset the journal work. We do this even if truncate has failed,
+ * as it could have failed after committing the extend.
+ */
+ access_rc = ocfs2_xa_journal_access(ctxt->handle, loc,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+
+ /* Errors in truncate take precedence */
+ return trunc_rc ? trunc_rc : access_rc;
+}
+
+static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
+{
+ int index, count;
+ struct ocfs2_xattr_header *xh = loc->xl_header;
+ struct ocfs2_xattr_entry *entry = loc->xl_entry;
+
+ ocfs2_xa_wipe_namevalue(loc);
+ loc->xl_entry = NULL;
+
+ le16_add_cpu(&xh->xh_count, -1);
+ count = le16_to_cpu(xh->xh_count);
+
+ /*
+ * Only zero out the entry if there are more remaining. This is
+ * important for an empty bucket, as it keeps track of the
+ * bucket's hash value. It doesn't hurt empty block storage.
+ */
+ if (count) {
+ index = ((char *)entry - (char *)&xh->xh_entries) /
+ sizeof(struct ocfs2_xattr_entry);
+ memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1],
+ (count - index) * sizeof(struct ocfs2_xattr_entry));
+ memset(&xh->xh_entries[count], 0,
+ sizeof(struct ocfs2_xattr_entry));
+ }
}
/*
- * ocfs2_xattr_set_entry()
+ * If we have a problem adjusting the size of an external value during
+ * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr
+ * in an intermediate state. For example, the value may be partially
+ * truncated.
*
- * Set extended attribute entry into inode or block.
+ * If the value tree hasn't changed, the extend/truncate went nowhere.
+ * We have nothing to do. The caller can treat it as a straight error.
*
- * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE,
- * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(),
- * then set value in B tree with set_value_outside().
+ * If the value tree got partially truncated, we now have a corrupted
+ * extended attribute. We're going to wipe its entry and leak the
+ * clusters. Better to leak some storage than leave a corrupt entry.
+ *
+ * If the value tree grew, it obviously didn't grow enough for the
+ * new entry. We're not going to try and reclaim those clusters either.
+ * If there was already an external value there (orig_clusters != 0),
+ * the new clusters are attached safely and we can just leave the old
+ * value in place. If there was no external value there, we remove
+ * the entry.
+ *
+ * This way, the xattr block we store in the journal will be consistent.
+ * If the size change broke because of the journal, no changes will hit
+ * disk anyway.
*/
-static int ocfs2_xattr_set_entry(struct inode *inode,
- struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs,
- struct ocfs2_xattr_set_ctxt *ctxt,
- int flag)
+static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc,
+ const char *what,
+ unsigned int orig_clusters)
{
- struct ocfs2_xattr_entry *last;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
- size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
- size_t size_l = 0;
- handle_t *handle = ctxt->handle;
- int free, i, ret;
- struct ocfs2_xattr_info xi_l = {
- .name_index = xi->name_index,
- .name = xi->name,
- .value = xi->value,
- .value_len = xi->value_len,
- };
- struct ocfs2_xattr_value_buf vb = {
- .vb_bh = xs->xattr_bh,
- .vb_access = ocfs2_journal_access_di,
- };
-
- if (!(flag & OCFS2_INLINE_XATTR_FL)) {
- BUG_ON(xs->xattr_bh == xs->inode_bh);
- vb.vb_access = ocfs2_journal_access_xb;
- } else
- BUG_ON(xs->xattr_bh != xs->inode_bh);
+ unsigned int new_clusters = ocfs2_xa_value_clusters(loc);
+ char *nameval_buf = ocfs2_xa_offset_pointer(loc,
+ le16_to_cpu(loc->xl_entry->xe_name_offset));
+
+ if (new_clusters < orig_clusters) {
+ mlog(ML_ERROR,
+ "Partial truncate while %s xattr %.*s. Leaking "
+ "%u clusters and removing the entry\n",
+ what, loc->xl_entry->xe_name_len, nameval_buf,
+ orig_clusters - new_clusters);
+ ocfs2_xa_remove_entry(loc);
+ } else if (!orig_clusters) {
+ mlog(ML_ERROR,
+ "Unable to allocate an external value for xattr "
+ "%.*s safely. Leaking %u clusters and removing the "
+ "entry\n",
+ loc->xl_entry->xe_name_len, nameval_buf,
+ new_clusters - orig_clusters);
+ ocfs2_xa_remove_entry(loc);
+ } else if (new_clusters > orig_clusters)
+ mlog(ML_ERROR,
+ "Unable to grow xattr %.*s safely. %u new clusters "
+ "have been added, but the value will not be "
+ "modified\n",
+ loc->xl_entry->xe_name_len, nameval_buf,
+ new_clusters - orig_clusters);
+}
- /* Compute min_offs, last and free space. */
- last = xs->header->xh_entries;
+/*
+ * Remove the xattr entry at loc->xl_entry, releasing any externally
+ * stored value first.
+ *
+ * Returns 0 when the entry has been removed, or the error from
+ * ocfs2_xa_value_truncate() when that truncate failed without changing
+ * the value's cluster count.
+ */
+static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int rc = 0;
+ unsigned int orig_clusters;
- for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
- size_t offs = le16_to_cpu(last->xe_name_offset);
- if (offs < min_offs)
- min_offs = offs;
- last += 1;
+ if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+ orig_clusters = ocfs2_xa_value_clusters(loc);
+ rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+ if (rc) {
+ mlog_errno(rc);
+ /*
+ * Since this is remove, we can return 0 if
+ * ocfs2_xa_cleanup_value_truncate() is going to
+ * wipe the entry anyway. So we check the
+ * cluster count as well.
+ */
+ if (orig_clusters != ocfs2_xa_value_clusters(loc))
+ rc = 0;
+ ocfs2_xa_cleanup_value_truncate(loc, "removing",
+ orig_clusters);
+ /*
+ * Either the cleanup above has already removed the
+ * entry (cluster count changed, rc == 0), or the
+ * count is unchanged and the error is returned.
+ * In neither case may we fall through to a second
+ * ocfs2_xa_remove_entry() below.
+ */
+ goto out;
+ }
}
- free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
- if (free < 0)
- return -EIO;
+ ocfs2_xa_remove_entry(loc);
- if (!xs->not_found) {
- size_t size = 0;
- if (ocfs2_xattr_is_local(xs->here))
- size = OCFS2_XATTR_SIZE(name_len) +
- OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
- else
- size = OCFS2_XATTR_SIZE(name_len) +
- OCFS2_XATTR_ROOT_SIZE;
- free += (size + sizeof(struct ocfs2_xattr_entry));
- }
- /* Check free space in inode or block */
- if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
- if (free < sizeof(struct ocfs2_xattr_entry) +
- OCFS2_XATTR_SIZE(name_len) +
- OCFS2_XATTR_ROOT_SIZE) {
- ret = -ENOSPC;
- goto out;
+out:
+ return rc;
+}
+
+/*
+ * Copy the default value root (def_xv, OCFS2_XATTR_ROOT_SIZE bytes) into
+ * the entry's name+value region, OCFS2_XATTR_SIZE(xe_name_len) bytes
+ * past the start of the name.
+ */
+static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc)
+{
+	int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	char *root_dst = ocfs2_xa_offset_pointer(loc, nameval_offset);
+
+	root_dst += OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
+	memcpy(root_dst, &def_xv, OCFS2_XATTR_ROOT_SIZE);
+}
+
+/*
+ * Take an existing entry and make it ready for the new value. This
+ * won't allocate space, but it may free space. It should be ready for
+ * ocfs2_xa_prepare_entry() to finish the work.
+ *
+ * OCFS2_XATTR_SIZE() of the existing entry's name length must equal
+ * that of xi->xi_name_len; this is enforced with BUG_ON.
+ *
+ * - Entry currently local: the bytes after the name are zeroed, and a
+ * value root is installed if the new value exceeds
+ * OCFS2_XATTR_INLINE_SIZE.
+ * - Entry currently external: if the new value is local, the old value
+ * is truncated to zero and, on success, the bytes after the name are
+ * zeroed; otherwise the old value is truncated to xi->xi_value_len
+ * only when the recorded xe_value_size is larger than that.
+ *
+ * If a truncate fails, ocfs2_xa_cleanup_value_truncate() is called and
+ * the error is returned without updating xe_value_size or the local
+ * flag. Otherwise both are set for the new value and 0 is returned.
+ */
+static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int rc = 0;
+ int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
+ unsigned int orig_clusters;
+ char *nameval_buf;
+ int xe_local = ocfs2_xattr_is_local(loc->xl_entry);
+ int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE;
+
+ BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) !=
+ name_size);
+
+ nameval_buf = ocfs2_xa_offset_pointer(loc,
+ le16_to_cpu(loc->xl_entry->xe_name_offset));
+ if (xe_local) {
+ memset(nameval_buf + name_size, 0,
+ namevalue_size_xe(loc->xl_entry) - name_size);
+ if (!xi_local)
+ ocfs2_xa_install_value_root(loc);
+ } else {
+ orig_clusters = ocfs2_xa_value_clusters(loc);
+ if (xi_local) {
+ rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+ if (rc < 0)
+ mlog_errno(rc);
+ else
+ memset(nameval_buf + name_size, 0,
+ namevalue_size_xe(loc->xl_entry) -
+ name_size);
+ } else if (le64_to_cpu(loc->xl_entry->xe_value_size) >
+ xi->xi_value_len) {
+ rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len,
+ ctxt);
+ if (rc < 0)
+ mlog_errno(rc);
}
- size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
- xi_l.value = (void *)&def_xv;
- xi_l.value_len = OCFS2_XATTR_ROOT_SIZE;
- } else if (xi->value) {
- if (free < sizeof(struct ocfs2_xattr_entry) +
- OCFS2_XATTR_SIZE(name_len) +
- OCFS2_XATTR_SIZE(xi->value_len)) {
- ret = -ENOSPC;
+
+ if (rc) {
+ ocfs2_xa_cleanup_value_truncate(loc, "reusing",
+ orig_clusters);
goto out;
}
}
- if (!xs->not_found) {
- /* For existing extended attribute */
- size_t size = OCFS2_XATTR_SIZE(name_len) +
- OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
- size_t offs = le16_to_cpu(xs->here->xe_name_offset);
- void *val = xs->base + offs;
+ loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
+ ocfs2_xattr_set_local(loc->xl_entry, xi_local);
- if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
- /* Replace existing local xattr with tree root */
- ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
- ctxt, &vb, offs);
- if (ret < 0)
- mlog_errno(ret);
- goto out;
- } else if (!ocfs2_xattr_is_local(xs->here)) {
- /* For existing xattr which has value outside */
- vb.vb_xv = (struct ocfs2_xattr_value_root *)
- (val + OCFS2_XATTR_SIZE(name_len));
+out:
+ return rc;
+}
- if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
- /*
- * If new value need set outside also,
- * first truncate old value to new value,
- * then set new value with set_value_outside().
- */
- ret = ocfs2_xattr_value_truncate(inode,
- &vb,
- xi->value_len,
- ctxt);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
+/*
+ * Prepares loc->xl_entry to receive the new xattr. This includes
+ * properly setting up the name+value pair region. If loc->xl_entry
+ * already exists, it will take care of modifying it appropriately.
+ *
+ * Note that this modifies the data. You did journal_access already,
+ * right?
+ *
+ * Flow: after ocfs2_xa_check_space() passes, an existing entry is
+ * either reused in place (when ocfs2_xa_can_reuse_entry() allows it) or
+ * has its external value, if any, truncated to zero and its name+value
+ * wiped. With no existing entry, a new one is added using name_hash.
+ * A blank entry then receives the name+value region and, for values
+ * larger than OCFS2_XATTR_INLINE_SIZE, a value root. Finally, values
+ * larger than OCFS2_XATTR_INLINE_SIZE are sized to xi->xi_value_len
+ * with ocfs2_xa_value_truncate().
+ *
+ * If that final sizing fails, ctxt->set_abort is set to 1. Returns 0
+ * on success or the first error encountered.
+ */
+static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi,
+ u32 name_hash,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int rc = 0;
+ unsigned int orig_clusters;
+ __le64 orig_value_size = 0;
- ret = ocfs2_xattr_update_entry(inode,
- handle,
- xi,
- xs,
- &vb,
- offs);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
+ rc = ocfs2_xa_check_space(loc, xi);
+ if (rc)
+ goto out;
- ret = __ocfs2_xattr_set_value_outside(inode,
- handle,
- vb.vb_xv,
- xi->value,
- xi->value_len);
- if (ret < 0)
- mlog_errno(ret);
+ if (loc->xl_entry) {
+ if (ocfs2_xa_can_reuse_entry(loc, xi)) {
+ orig_value_size = loc->xl_entry->xe_value_size;
+ rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
+ if (rc)
goto out;
- } else {
- /*
- * If new value need set in local,
- * just trucate old value to zero.
- */
- ret = ocfs2_xattr_value_truncate(inode,
- &vb,
- 0,
- ctxt);
- if (ret < 0)
- mlog_errno(ret);
+ goto alloc_value;
+ }
+
+ if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+ orig_clusters = ocfs2_xa_value_clusters(loc);
+ rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+ if (rc) {
+ mlog_errno(rc);
+ ocfs2_xa_cleanup_value_truncate(loc,
+ "overwriting",
+ orig_clusters);
+ goto out;
+ }
+ }
+ ocfs2_xa_wipe_namevalue(loc);
+ } else
+ ocfs2_xa_add_entry(loc, name_hash);
+
+ /*
+ * If we get here, we have a blank entry. Fill it. We grow our
+ * name+value pair back from the end.
+ */
+ ocfs2_xa_add_namevalue(loc, xi);
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
+ ocfs2_xa_install_value_root(loc);
+
+alloc_value:
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+ orig_clusters = ocfs2_xa_value_clusters(loc);
+ rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
+ if (rc < 0) {
+ ctxt->set_abort = 1;
+ ocfs2_xa_cleanup_value_truncate(loc, "growing",
+ orig_clusters);
+ /*
+ * If we were growing an existing value,
+ * ocfs2_xa_cleanup_value_truncate() won't remove
+ * the entry. We need to restore the original value
+ * size.
+ */
+ if (loc->xl_entry) {
+ BUG_ON(!orig_value_size);
+ loc->xl_entry->xe_value_size = orig_value_size;
}
+ mlog_errno(rc);
}
}
- ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
+out:
+ return rc;
+}
+
+/*
+ * Store the value portion of the name+value pair.  A value larger than
+ * OCFS2_XATTR_INLINE_SIZE is written to its external storage through
+ * __ocfs2_xattr_set_value_outside(); its tree root was set up by
+ * ocfs2_xa_prepare_entry().  A smaller value is copied in directly
+ * after the name.
+ *
+ * Returns 0, or the result of __ocfs2_xattr_set_value_outside().
+ */
+static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
+				struct ocfs2_xattr_info *xi,
+				struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	struct ocfs2_xattr_value_buf vb;
+	int entry_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	int value_offset = OCFS2_XATTR_SIZE(xi->xi_name_len);
+	char *entry_data = ocfs2_xa_offset_pointer(loc, entry_offset);
+
+	if (xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) {
+		memcpy(entry_data + value_offset, xi->xi_value,
+		       xi->xi_value_len);
+		return 0;
+	}
+
+	ocfs2_xa_fill_value_buf(loc, &vb);
+	return __ocfs2_xattr_set_value_outside(loc->xl_inode, ctxt->handle,
+					       &vb, xi->xi_value,
+					       xi->xi_value_len);
+}
+/*
+ * Set or remove one xattr at loc within the transaction ctxt->handle.
+ *
+ * A NULL xi->xi_value means "remove" (ocfs2_xa_remove()); otherwise the
+ * entry is prepared by ocfs2_xa_prepare_entry() and the value written
+ * by ocfs2_xa_store_value(). Once ocfs2_xa_journal_access() has
+ * succeeded, ocfs2_xa_journal_dirty() is always called, whether or not
+ * the later steps fail.
+ *
+ * Returns 0 on success or the first error encountered; -ENOSPC from
+ * ocfs2_xa_prepare_entry() is returned without being logged.
+ */
+static int ocfs2_xa_set(struct ocfs2_xa_loc *loc,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret;
+ u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name,
+ xi->xi_name_len);
+
+ ret = ocfs2_xa_journal_access(ctxt->handle, loc,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
- if (!(flag & OCFS2_INLINE_XATTR_FL)) {
- ret = vb.vb_access(handle, inode, vb.vb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
- }
-
/*
- * Set value in local, include set tree root in local.
- * This is the first step for value size >INLINE_SIZE.
+ * From here on out, everything is going to modify the buffer a
+ * little. Errors are going to leave the xattr header in a
+ * sane state. Thus, even with errors we dirty the sucker.
*/
- ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
- if (!(flag & OCFS2_INLINE_XATTR_FL)) {
- ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
- if (ret < 0) {
+ /* Don't worry, we are never called with !xi_value and !xl_entry */
+ if (!xi->xi_value) {
+ ret = ocfs2_xa_remove(loc, ctxt);
+ goto out_dirty;
+ }
+
+ ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
+ if (ret) {
+ if (ret != -ENOSPC)
mlog_errno(ret);
- goto out;
- }
+ goto out_dirty;
}
- if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
- (flag & OCFS2_INLINE_XATTR_FL)) {
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- unsigned int xattrsize = osb->s_xattr_inline_size;
+ ret = ocfs2_xa_store_value(loc, xi, ctxt);
+ if (ret)
+ mlog_errno(ret);
- /*
- * Adjust extent record count or inline data size
- * to reserve space for extended attribute.
- */
- if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- struct ocfs2_inline_data *idata = &di->id2.i_data;
- le16_add_cpu(&idata->id_count, -xattrsize);
- } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
- struct ocfs2_extent_list *el = &di->id2.i_list;
- le16_add_cpu(&el->l_count, -(xattrsize /
- sizeof(struct ocfs2_extent_rec)));
- }
- di->i_xattr_inline_size = cpu_to_le16(xattrsize);
- }
- /* Update xattr flag */
- spin_lock(&oi->ip_lock);
- oi->ip_dyn_features |= flag;
- di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
- spin_unlock(&oi->ip_lock);
+out_dirty:
+ ocfs2_xa_journal_dirty(ctxt->handle, loc);
- ret = ocfs2_journal_dirty(handle, xs->inode_bh);
- if (ret < 0)
+out:
+ return ret;
+}
+
+/*
+ * Fill in loc for the inline xattr area of the dinode held in bh.  The
+ * header is placed i_xattr_inline_size bytes before the end of the
+ * buffer.  The inode must already have OCFS2_INLINE_XATTR_FL set;
+ * anything else trips the BUG_ON.
+ */
+static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
+				     struct inode *inode,
+				     struct buffer_head *bh,
+				     struct ocfs2_xattr_entry *entry)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL));
+
+	/* xl_size must be set first: the header position depends on it. */
+	loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
+	loc->xl_header = (struct ocfs2_xattr_header *)
+		(bh->b_data + bh->b_size - loc->xl_size);
+	loc->xl_entry = entry;
+	loc->xl_storage = bh;
+	loc->xl_ops = &ocfs2_xa_block_loc_ops;
+	loc->xl_inode = inode;
+}
+
+/*
+ * Fill in loc for a non-indexed xattr block held in bh.  The usable
+ * size is everything from xb_attrs.xb_header to the end of the buffer.
+ * A block flagged OCFS2_XATTR_INDEXED trips the BUG_ON.
+ */
+static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
+					  struct inode *inode,
+					  struct buffer_head *bh,
+					  struct ocfs2_xattr_entry *entry)
+{
+	struct ocfs2_xattr_block *xblk =
+		(struct ocfs2_xattr_block *)bh->b_data;
+
+	BUG_ON(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED);
+
+	loc->xl_header = &(xblk->xb_attrs.xb_header);
+	loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block,
+					     xb_attrs.xb_header);
+	loc->xl_entry = entry;
+	loc->xl_storage = bh;
+	loc->xl_ops = &ocfs2_xa_block_loc_ops;
+	loc->xl_inode = inode;
+}
+
+/*
+ * Fill in loc for an xattr bucket.  The inode comes from the bucket
+ * itself, the header from bucket_xh(), and the size is fixed at
+ * OCFS2_XATTR_BUCKET_SIZE.
+ */
+static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
+					   struct ocfs2_xattr_bucket *bucket,
+					   struct ocfs2_xattr_entry *entry)
+{
+	loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
+	loc->xl_entry = entry;
+	loc->xl_header = bucket_xh(bucket);
+	loc->xl_storage = bucket;
+	loc->xl_ops = &ocfs2_xa_bucket_loc_ops;
+	loc->xl_inode = bucket->bu_inode;
+}
+
+/*
+ * When removing an xattr whose value is stored outside and refcounted,
+ * we may have to split the refcount tree, so we need the allocators.
+ *
+ * *ref_credits is set to 0 first. The extent at cpos 0 of xv->xr_list
+ * is then looked up; if it is not flagged OCFS2_EXT_REFCOUNTED nothing
+ * is reserved and 0 is returned. Otherwise
+ * ocfs2_refcounted_xattr_delete_need() supplies the metadata block
+ * count and *ref_credits, and that many metadata blocks are reserved
+ * into *meta_ac.
+ *
+ * Returns 0 on success or the first error encountered (logged).
+ */
+static int ocfs2_lock_xattr_remove_allocators(struct inode *inode,
+ struct ocfs2_xattr_value_root *xv,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_alloc_context **meta_ac,
+ int *ref_credits)
+{
+ int ret, meta_add = 0;
+ u32 p_cluster, num_clusters;
+ unsigned int ext_flags;
+
+ *ref_credits = 0;
+ ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
+ &num_clusters,
+ &xv->xr_list,
+ &ext_flags);
+ if (ret) {
mlog_errno(ret);
+ goto out;
+ }
- if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
- /*
- * Set value outside in B tree.
- * This is the second step for value size > INLINE_SIZE.
- */
- size_t offs = le16_to_cpu(xs->here->xe_name_offset);
- ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
- &vb, offs);
- if (ret < 0) {
- int ret2;
+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+ goto out;
- mlog_errno(ret);
- /*
- * If set value outside failed, we have to clean
- * the junk tree root we have already set in local.
- */
- ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
- xi, xs, &vb, offs);
- if (ret2 < 0)
- mlog_errno(ret2);
- }
+ ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci,
+ ref_root_bh, xv,
+ &meta_add, ref_credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
+
+ ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+ meta_add, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
out:
return ret;
}
static int ocfs2_remove_value_outside(struct inode*inode,
struct ocfs2_xattr_value_buf *vb,
- struct ocfs2_xattr_header *header)
+ struct ocfs2_xattr_header *header,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh)
{
- int ret = 0, i;
+ int ret = 0, i, ref_credits;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+ void *val;
ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
- ctxt.handle = ocfs2_start_trans(osb,
- ocfs2_remove_extent_credits(osb->sb));
- if (IS_ERR(ctxt.handle)) {
- ret = PTR_ERR(ctxt.handle);
- mlog_errno(ret);
- goto out;
- }
-
for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
- if (!ocfs2_xattr_is_local(entry)) {
- void *val;
+ if (ocfs2_xattr_is_local(entry))
+ continue;
- val = (void *)header +
- le16_to_cpu(entry->xe_name_offset);
- vb->vb_xv = (struct ocfs2_xattr_value_root *)
- (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
- ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
- if (ret < 0) {
- mlog_errno(ret);
- break;
- }
+ val = (void *)header +
+ le16_to_cpu(entry->xe_name_offset);
+ vb->vb_xv = (struct ocfs2_xattr_value_root *)
+ (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
+
+ ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv,
+ ref_ci, ref_root_bh,
+ &ctxt.meta_ac,
+ &ref_credits);
+
+ ctxt.handle = ocfs2_start_trans(osb, ref_credits +
+ ocfs2_remove_extent_credits(osb->sb));
+ if (IS_ERR(ctxt.handle)) {
+ ret = PTR_ERR(ctxt.handle);
+ mlog_errno(ret);
+ break;
}
+
+ ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
+
+ ocfs2_commit_trans(osb, ctxt.handle);
+ if (ctxt.meta_ac) {
+ ocfs2_free_alloc_context(ctxt.meta_ac);
+ ctxt.meta_ac = NULL;
+ }
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
}
- ocfs2_commit_trans(osb, ctxt.handle);
+ if (ctxt.meta_ac)
+ ocfs2_free_alloc_context(ctxt.meta_ac);
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &ctxt.dealloc);
-out:
return ret;
}
static int ocfs2_xattr_ibody_remove(struct inode *inode,
- struct buffer_head *di_bh)
+ struct buffer_head *di_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh)
{
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1758,13 +2421,21 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
((void *)di + inode->i_sb->s_blocksize -
le16_to_cpu(di->i_xattr_inline_size));
- ret = ocfs2_remove_value_outside(inode, &vb, header);
+ ret = ocfs2_remove_value_outside(inode, &vb, header,
+ ref_ci, ref_root_bh);
return ret;
}
+struct ocfs2_rm_xattr_bucket_para { /* passed with ocfs2_rm_xattr_cluster to ocfs2_iterate_xattr_index_block() */
+ struct ocfs2_caching_info *ref_ci; /* refcount tree caching info; NULL when the caller has none */
+ struct buffer_head *ref_root_bh; /* presumably the refcount tree root buffer; NULL when the caller has none */
+};
+
static int ocfs2_xattr_block_remove(struct inode *inode,
- struct buffer_head *blk_bh)
+ struct buffer_head *blk_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh)
{
struct ocfs2_xattr_block *xb;
int ret = 0;
@@ -1772,19 +2443,29 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
.vb_bh = blk_bh,
.vb_access = ocfs2_journal_access_xb,
};
+ struct ocfs2_rm_xattr_bucket_para args = {
+ .ref_ci = ref_ci,
+ .ref_root_bh = ref_root_bh,
+ };
xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
- ret = ocfs2_remove_value_outside(inode, &vb, header);
+ ret = ocfs2_remove_value_outside(inode, &vb, header,
+ ref_ci, ref_root_bh);
} else
- ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
+ ret = ocfs2_iterate_xattr_index_block(inode,
+ blk_bh,
+ ocfs2_rm_xattr_cluster,
+ &args);
return ret;
}
static int ocfs2_xattr_free_block(struct inode *inode,
- u64 block)
+ u64 block,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh)
{
struct inode *xb_alloc_inode;
struct buffer_head *xb_alloc_bh = NULL;
@@ -1802,7 +2483,7 @@ static int ocfs2_xattr_free_block(struct inode *inode,
goto out;
}
- ret = ocfs2_xattr_block_remove(inode, blk_bh);
+ ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1811,7 +2492,10 @@ static int ocfs2_xattr_free_block(struct inode *inode,
xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
blk = le64_to_cpu(xb->xb_blkno);
bit = le16_to_cpu(xb->xb_suballoc_bit);
- bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+ if (xb->xb_suballoc_loc)
+ bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
+ else
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
xb_alloc_inode = ocfs2_get_system_file_inode(osb,
EXTENT_ALLOC_SYSTEM_INODE,
@@ -1862,6 +2546,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_caching_info *ref_ci = NULL;
handle_t *handle;
int ret;
@@ -1871,8 +2558,21 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
return 0;
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+ ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb),
+ le64_to_cpu(di->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ ref_ci = &ref_tree->rf_ci;
+
+ }
+
if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
- ret = ocfs2_xattr_ibody_remove(inode, di_bh);
+ ret = ocfs2_xattr_ibody_remove(inode, di_bh,
+ ref_ci, ref_root_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1881,7 +2581,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
if (di->i_xattr_loc) {
ret = ocfs2_xattr_free_block(inode,
- le64_to_cpu(di->i_xattr_loc));
+ le64_to_cpu(di->i_xattr_loc),
+ ref_ci, ref_root_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1895,7 +2596,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -1908,13 +2609,15 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
spin_unlock(&oi->ip_lock);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, di_bh);
out_commit:
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1);
+ brelse(ref_root_bh);
return ret;
}
@@ -1995,6 +2698,53 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
return 0;
}
+/*
+ * Reserve the inline xattr area in the dinode held in di_bh.
+ *
+ * Shrinks the inline-data size or the extent record count (fast
+ * symlinks get neither adjustment) by the superblock's
+ * s_xattr_inline_size, records that size in i_xattr_inline_size, and
+ * sets OCFS2_INLINE_XATTR_FL and OCFS2_HAS_XATTR_FL in both the
+ * in-memory and on-disk feature flags.
+ *
+ * Returns 0 on success, -ENOSPC if ocfs2_xattr_has_space_inline() says
+ * there is no room, or the error from ocfs2_journal_access_di().
+ */
+static int ocfs2_xattr_ibody_init(struct inode *inode,
+				  struct buffer_head *di_bh,
+				  struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
+	int ret;
+
+	if (!ocfs2_xattr_has_space_inline(inode, di))
+		return -ENOSPC;
+
+	ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/*
+	 * Adjust extent record count or inline data size
+	 * to reserve space for extended attribute.
+	 */
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		le16_add_cpu(&di->id2.i_data.id_count, -xattrsize);
+	} else if (!(ocfs2_inode_is_fast_symlink(inode))) {
+		le16_add_cpu(&di->id2.i_list.l_count,
+			     -(xattrsize / sizeof(struct ocfs2_extent_rec)));
+	}
+	di->i_xattr_inline_size = cpu_to_le16(xattrsize);
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+
+	ocfs2_journal_dirty(ctxt->handle, di_bh);
+
+	return 0;
+}
+
/*
* ocfs2_xattr_ibody_set()
*
@@ -2006,23 +2756,33 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
struct ocfs2_xattr_search *xs,
struct ocfs2_xattr_set_ctxt *ctxt)
{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
int ret;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_xa_loc loc;
if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
return -ENOSPC;
down_write(&oi->ip_alloc_sem);
if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
- if (!ocfs2_xattr_has_space_inline(inode, di)) {
- ret = -ENOSPC;
+ ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
+ if (ret) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
goto out;
}
}
- ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
- (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
+ ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
+ xs->not_found ? NULL : xs->here);
+ ret = ocfs2_xa_set(&loc, xi, ctxt);
+ if (ret) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+ xs->here = loc.xl_entry;
+
out:
up_write(&oi->ip_alloc_sem);
@@ -2082,6 +2842,92 @@ cleanup:
return ret;
}
+/*
+ * Allocate and initialise a new xattr block for inode and link it from
+ * the dinode held in inode_bh.
+ *
+ * One metadata block is claimed from ctxt->meta_ac.  The block is
+ * zeroed, stamped with the xattr block signature and its allocation
+ * details and, when indexed is non-zero, set up as an indexed root with
+ * one cluster and one used record.  i_xattr_loc is pointed at the new
+ * block and OCFS2_HAS_XATTR_FL is set on the inode.
+ *
+ * On success returns 0 and hands the new buffer to the caller through
+ * *ret_bh.  On failure returns a negative error, releases any buffer
+ * obtained and leaves *ret_bh untouched.
+ */
+static int ocfs2_create_xattr_block(struct inode *inode,
+				    struct buffer_head *inode_bh,
+				    struct ocfs2_xattr_set_ctxt *ctxt,
+				    int indexed,
+				    struct buffer_head **ret_bh)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
+	struct buffer_head *xb_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	u64 group_loc, blkno;
+	u32 claimed;
+	u16 bit_start;
+	int ret;
+
+	ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
+				      inode_bh, OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+
+	ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
+				   &group_loc, &bit_start,
+				   &claimed, &blkno);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+
+	xb_bh = sb_getblk(inode->i_sb, blkno);
+	if (!xb_bh) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto end;
+	}
+
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), xb_bh);
+
+	ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
+				      xb_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+
+	/* Start from a zeroed block and fill in the identifying fields. */
+	xb = (struct ocfs2_xattr_block *)xb_bh->b_data;
+	memset(xb, 0, inode->i_sb->s_blocksize);
+	strcpy((void *)xb, OCFS2_XATTR_BLOCK_SIGNATURE);
+	xb->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
+	xb->xb_suballoc_loc = cpu_to_le64(group_loc);
+	xb->xb_suballoc_bit = cpu_to_le16(bit_start);
+	xb->xb_fs_generation =
+		cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
+	xb->xb_blkno = cpu_to_le64(blkno);
+	if (indexed) {
+		struct ocfs2_xattr_tree_root *root = &xb->xb_attrs.xb_root;
+
+		root->xt_clusters = cpu_to_le32(1);
+		root->xt_last_eb_blk = 0;
+		root->xt_list.l_tree_depth = 0;
+		root->xt_list.l_count = cpu_to_le16(
+					ocfs2_xattr_recs_per_xb(inode->i_sb));
+		root->xt_list.l_next_free_rec = cpu_to_le16(1);
+		xb->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
+	}
+	ocfs2_journal_dirty(ctxt->handle, xb_bh);
+
+	/* Hook the new block into the inode. */
+	di->i_xattr_loc = cpu_to_le64(blkno);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
+	di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	ocfs2_journal_dirty(ctxt->handle, inode_bh);
+
+	/* Ownership of the buffer passes to the caller. */
+	*ret_bh = xb_bh;
+	xb_bh = NULL;
+
+end:
+	brelse(xb_bh);
+	return ret;
+}
+
/*
* ocfs2_xattr_block_set()
*
@@ -2094,82 +2940,47 @@ static int ocfs2_xattr_block_set(struct inode *inode,
struct ocfs2_xattr_set_ctxt *ctxt)
{
struct buffer_head *new_bh = NULL;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
- handle_t *handle = ctxt->handle;
struct ocfs2_xattr_block *xblk = NULL;
- u16 suballoc_bit_start;
- u32 num_got;
- u64 first_blkno;
int ret;
+ struct ocfs2_xa_loc loc;
if (!xs->xattr_bh) {
- ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
- if (ret < 0) {
- mlog_errno(ret);
- goto end;
- }
-
- ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
- &suballoc_bit_start, &num_got,
- &first_blkno);
- if (ret < 0) {
- mlog_errno(ret);
- goto end;
- }
-
- new_bh = sb_getblk(inode->i_sb, first_blkno);
- ocfs2_set_new_buffer_uptodate(inode, new_bh);
-
- ret = ocfs2_journal_access_xb(handle, inode, new_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
- if (ret < 0) {
+ ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt,
+ 0, &new_bh);
+ if (ret) {
mlog_errno(ret);
goto end;
}
- /* Initialize ocfs2_xattr_block */
xs->xattr_bh = new_bh;
- xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
- memset(xblk, 0, inode->i_sb->s_blocksize);
- strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
- xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
- xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
- xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
- xblk->xb_blkno = cpu_to_le64(first_blkno);
-
+ xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
xs->header = &xblk->xb_attrs.xb_header;
xs->base = (void *)xs->header;
xs->end = (void *)xblk + inode->i_sb->s_blocksize;
xs->here = xs->header->xh_entries;
-
- ret = ocfs2_journal_dirty(handle, new_bh);
- if (ret < 0) {
- mlog_errno(ret);
- goto end;
- }
- di->i_xattr_loc = cpu_to_le64(first_blkno);
- ocfs2_journal_dirty(handle, xs->inode_bh);
} else
xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
- /* Set extended attribute into external block */
- ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
- OCFS2_HAS_XATTR_FL);
- if (!ret || ret != -ENOSPC)
- goto end;
+ ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
+ xs->not_found ? NULL : xs->here);
- ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
- if (ret)
+ ret = ocfs2_xa_set(&loc, xi, ctxt);
+ if (!ret)
+ xs->here = loc.xl_entry;
+ else if ((ret != -ENOSPC) || ctxt->set_abort)
goto end;
+ else {
+ ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
+ if (ret)
+ goto end;
+ }
}
- ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
+ if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)
+ ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
end:
-
return ret;
}
@@ -2178,7 +2989,6 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
struct ocfs2_xattr_info *xi,
struct ocfs2_xattr_search *xs)
{
- u64 value_size;
struct ocfs2_xattr_entry *last;
int free, i;
size_t min_offs = xs->end - xs->base;
@@ -2201,13 +3011,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
BUG_ON(!xs->not_found);
- if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
- value_size = OCFS2_XATTR_ROOT_SIZE;
- else
- value_size = OCFS2_XATTR_SIZE(xi->value_len);
-
- if (free >= sizeof(struct ocfs2_xattr_entry) +
- OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
+ if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi)))
return 1;
return 0;
@@ -2231,7 +3035,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
char *base = NULL;
int name_offset, name_len = 0;
u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
- xi->value_len);
+ xi->xi_value_len);
u64 value_size;
/*
@@ -2239,18 +3043,17 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
* No matter whether we replace an old one or add a new one,
* we need this for writing.
*/
- if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
credits += new_clusters *
ocfs2_clusters_to_blocks(inode->i_sb, 1);
if (xis->not_found && xbs->not_found) {
credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
clusters_add += new_clusters;
credits += ocfs2_calc_extend_credits(inode->i_sb,
- &def_xv.xv.xr_list,
- new_clusters);
+ &def_xv.xv.xr_list);
}
goto meta_guess;
@@ -2272,7 +3075,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
old_in_xb = 1;
if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
- ret = ocfs2_xattr_bucket_get_name_value(inode,
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
bucket_xh(xbs->bucket),
i, &block_off,
&name_offset);
@@ -2291,7 +3094,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
* The credits for removing the value tree will be extended
* by ocfs2_remove_extent itself.
*/
- if (!xi->value) {
+ if (!xi->xi_value) {
if (!ocfs2_xattr_is_local(xe))
credits += ocfs2_remove_extent_credits(inode->i_sb);
@@ -2315,13 +3118,12 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
if (!ocfs2_xattr_is_local(xe))
credits += ocfs2_calc_extend_credits(
inode->i_sb,
- &def_xv.xv.xr_list,
- new_clusters);
+ &def_xv.xv.xr_list);
goto out;
}
}
- if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
/* the new values will be stored outside. */
u32 old_clusters = 0;
@@ -2341,9 +3143,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
clusters_add += new_clusters - old_clusters;
credits += ocfs2_calc_extend_credits(inode->i_sb,
- &xv->xr_list,
- new_clusters -
- old_clusters);
+ &xv->xr_list);
if (value_size >= OCFS2_XATTR_ROOT_SIZE)
goto out;
}
@@ -2354,9 +3154,10 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
* value, we don't need any allocation, otherwise we have
* to guess metadata allocation.
*/
- if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) ||
+ if ((ocfs2_xattr_is_local(xe) &&
+ (value_size >= xi->xi_value_len)) ||
(!ocfs2_xattr_is_local(xe) &&
- OCFS2_XATTR_ROOT_SIZE >= xi->value_len))
+ OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len))
goto out;
}
@@ -2388,7 +3189,7 @@ meta_guess:
&xb->xb_attrs.xb_root.xt_list;
meta_add += ocfs2_extend_meta_needed(el);
credits += ocfs2_calc_extend_credits(inode->i_sb,
- el, 1);
+ el);
} else
credits += OCFS2_SUBALLOC_ALLOC + 1;
@@ -2407,8 +3208,15 @@ meta_guess:
clusters_add += 1;
}
} else {
- meta_add += 1;
credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+ struct ocfs2_extent_list *el = &def_xv.xv.xr_list;
+ meta_add += ocfs2_extend_meta_needed(el);
+ credits += ocfs2_calc_extend_credits(inode->i_sb,
+ el);
+ } else {
+ meta_add += 1;
+ }
}
out:
if (clusters_need)
@@ -2427,6 +3235,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
struct ocfs2_xattr_search *xis,
struct ocfs2_xattr_search *xbs,
struct ocfs2_xattr_set_ctxt *ctxt,
+ int extra_meta,
int *credits)
{
int clusters_add, meta_add, ret;
@@ -2443,8 +3252,9 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
return ret;
}
- mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
- "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
+ meta_add += extra_meta;
+ trace_ocfs2_init_xattr_set_ctxt(xi->xi_name, meta_add,
+ clusters_add, *credits);
if (meta_add) {
ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -2484,7 +3294,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
{
int ret = 0, credits, old_found;
- if (!xi->value) {
+ if (!xi->xi_value) {
/* Remove existing extended attribute */
if (!xis->not_found)
ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
@@ -2498,8 +3308,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
* If succeed and that extended attribute existing in
* external block, then we will remove it.
*/
- xi->value = NULL;
- xi->value_len = 0;
+ xi->xi_value = NULL;
+ xi->xi_value_len = 0;
old_found = xis->not_found;
xis->not_found = -ENODATA;
@@ -2517,18 +3327,17 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
goto out;
}
- ret = ocfs2_extend_trans(ctxt->handle, credits +
- ctxt->handle->h_buffer_credits);
+ ret = ocfs2_extend_trans(ctxt->handle, credits);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
- } else if (ret == -ENOSPC) {
+ } else if ((ret == -ENOSPC) && !ctxt->set_abort) {
if (di->i_xattr_loc && !xbs->xattr_bh) {
ret = ocfs2_xattr_block_find(inode,
- xi->name_index,
- xi->name, xbs);
+ xi->xi_name_index,
+ xi->xi_name, xbs);
if (ret)
goto out;
@@ -2548,8 +3357,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
goto out;
}
- ret = ocfs2_extend_trans(ctxt->handle, credits +
- ctxt->handle->h_buffer_credits);
+ ret = ocfs2_extend_trans(ctxt->handle, credits);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2567,8 +3375,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
* If succeed and that extended attribute
* existing in inode, we will remove it.
*/
- xi->value = NULL;
- xi->value_len = 0;
+ xi->xi_value = NULL;
+ xi->xi_value_len = 0;
xbs->not_found = -ENODATA;
ret = ocfs2_calc_xattr_set_need(inode,
di,
@@ -2583,8 +3391,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
goto out;
}
- ret = ocfs2_extend_trans(ctxt->handle, credits +
- ctxt->handle->h_buffer_credits);
+ ret = ocfs2_extend_trans(ctxt->handle, credits);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2597,7 +3404,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
if (!ret) {
/* Update inode ctime. */
- ret = ocfs2_journal_access_di(ctxt->handle, inode,
+ ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
xis->inode_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
@@ -2634,10 +3441,11 @@ int ocfs2_xattr_set_handle(handle_t *handle,
int ret;
struct ocfs2_xattr_info xi = {
- .name_index = name_index,
- .name = name,
- .value = value,
- .value_len = value_len,
+ .xi_name_index = name_index,
+ .xi_name = name,
+ .xi_name_len = strlen(name),
+ .xi_value = value,
+ .xi_value_len = value_len,
};
struct ocfs2_xattr_search xis = {
@@ -2710,16 +3518,18 @@ int ocfs2_xattr_set(struct inode *inode,
{
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
- int ret, credits;
+ int ret, credits, ref_meta = 0, ref_credits = 0;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *tl_inode = osb->osb_tl_inode;
- struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+ struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, };
+ struct ocfs2_refcount_tree *ref_tree = NULL;
struct ocfs2_xattr_info xi = {
- .name_index = name_index,
- .name = name,
- .value = value,
- .value_len = value_len,
+ .xi_name_index = name_index,
+ .xi_name = name,
+ .xi_name_len = strlen(name),
+ .xi_value = value,
+ .xi_value_len = value_len,
};
struct ocfs2_xattr_search xis = {
@@ -2754,7 +3564,7 @@ int ocfs2_xattr_set(struct inode *inode,
down_write(&OCFS2_I(inode)->ip_xattr_sem);
/*
* Scan inode and external block to find the same name
- * extended attribute and collect search infomation.
+ * extended attribute and collect search information.
*/
ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
if (ret)
@@ -2778,6 +3588,17 @@ int ocfs2_xattr_set(struct inode *inode,
goto cleanup;
}
+ /* Check whether the value is refcounted and do some preparation. */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
+ (!xis.not_found || !xbs.not_found)) {
+ ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
+ &xis, &xbs, &ref_tree,
+ &ref_meta, &ref_credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+ }
mutex_lock(&tl_inode->i_mutex);
@@ -2792,7 +3613,7 @@ int ocfs2_xattr_set(struct inode *inode,
mutex_unlock(&tl_inode->i_mutex);
ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
- &xbs, &ctxt, &credits);
+ &xbs, &ctxt, ref_meta, &credits);
if (ret) {
mlog_errno(ret);
goto cleanup;
@@ -2800,17 +3621,19 @@ int ocfs2_xattr_set(struct inode *inode,
/* we need to update inode's ctime field, so add credit for it. */
credits += OCFS2_INODE_UPDATE_CREDITS;
- ctxt.handle = ocfs2_start_trans(osb, credits);
+ ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
if (IS_ERR(ctxt.handle)) {
ret = PTR_ERR(ctxt.handle);
mlog_errno(ret);
- goto cleanup;
+ goto out_free_ac;
}
ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+ ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0);
ocfs2_commit_trans(osb, ctxt.handle);
+out_free_ac:
if (ctxt.data_ac)
ocfs2_free_alloc_context(ctxt.data_ac);
if (ctxt.meta_ac)
@@ -2818,8 +3641,16 @@ int ocfs2_xattr_set(struct inode *inode,
if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &ctxt.dealloc);
+
cleanup:
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
up_write(&OCFS2_I(inode)->ip_xattr_sem);
+ if (!value && !ret) {
+ ret = ocfs2_try_remove_refcount_tree(inode, di_bh);
+ if (ret)
+ mlog_errno(ret);
+ }
ocfs2_inode_unlock(inode, 1);
cleanup_nolock:
brelse(di_bh);
@@ -2848,7 +3679,8 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
u64 e_blkno = 0;
if (el->l_tree_depth) {
- ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh);
+ ret = ocfs2_find_leaf(INODE_CACHE(inode), el, name_hash,
+ &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2930,7 +3762,7 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
if (cmp)
continue;
- ret = ocfs2_xattr_bucket_get_name_value(inode,
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
xh,
i,
&block_off,
@@ -3064,8 +3896,10 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
if (found) {
xs->here = &xs->header->xh_entries[index];
- mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
- (unsigned long long)bucket_blkno(xs->bucket), index);
+ trace_ocfs2_xattr_bucket_find(OCFS2_I(inode)->ip_blkno,
+ name, name_index, name_hash,
+ (unsigned long long)bucket_blkno(xs->bucket),
+ index);
} else
ret = -ENODATA;
@@ -3092,8 +3926,10 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
if (le16_to_cpu(el->l_next_free_rec) == 0)
return -ENODATA;
- mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n",
- name, name_hash, name_index);
+ trace_ocfs2_xattr_index_block_find(OCFS2_I(inode)->ip_blkno,
+ name, name_index, name_hash,
+ (unsigned long long)root_bh->b_blocknr,
+ -1);
ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
&num_clusters, el);
@@ -3104,9 +3940,10 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
- mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
- "in the rec is %u\n", num_clusters, (unsigned long long)p_blkno,
- first_hash);
+ trace_ocfs2_xattr_index_block_find_rec(OCFS2_I(inode)->ip_blkno,
+ name, name_index, first_hash,
+ (unsigned long long)p_blkno,
+ num_clusters);
ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
p_blkno, first_hash, num_clusters, xs);
@@ -3132,8 +3969,9 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
return -ENOMEM;
}
- mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
- clusters, (unsigned long long)blkno);
+ trace_ocfs2_iterate_xattr_buckets(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)blkno, clusters);
for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
ret = ocfs2_read_xattr_bucket(bucket, blkno);
@@ -3149,12 +3987,11 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
if (i == 0)
num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
- mlog(0, "iterating xattr bucket %llu, first hash %u\n",
- (unsigned long long)blkno,
+ trace_ocfs2_iterate_xattr_bucket((unsigned long long)blkno,
le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
if (func) {
ret = func(inode, bucket, para);
- if (ret)
+ if (ret && ret != -ERANGE)
mlog_errno(ret);
/* Fall through to bucket_relse() */
}
@@ -3174,7 +4011,7 @@ struct ocfs2_xattr_tree_list {
size_t result;
};
-static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
struct ocfs2_xattr_header *xh,
int index,
int *block_off,
@@ -3187,8 +4024,8 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
- *block_off = name_offset >> inode->i_sb->s_blocksize_bits;
- *new_offset = name_offset % inode->i_sb->s_blocksize;
+ *block_off = name_offset >> sb->s_blocksize_bits;
+ *new_offset = name_offset % sb->s_blocksize;
return 0;
}
@@ -3208,7 +4045,7 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
prefix = ocfs2_xattr_prefix(type);
if (prefix) {
- ret = ocfs2_xattr_bucket_get_name_value(inode,
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
bucket_xh(bucket),
i,
&block_off,
@@ -3231,22 +4068,19 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
return ret;
}
-static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
- struct ocfs2_xattr_tree_root *xt,
- char *buffer,
- size_t buffer_size)
+static int ocfs2_iterate_xattr_index_block(struct inode *inode,
+ struct buffer_head *blk_bh,
+ xattr_tree_rec_func *rec_func,
+ void *para)
{
- struct ocfs2_extent_list *el = &xt->xt_list;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)blk_bh->b_data;
+ struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
int ret = 0;
u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
u64 p_blkno = 0;
- struct ocfs2_xattr_tree_list xl = {
- .buffer = buffer,
- .buffer_size = buffer_size,
- .result = 0,
- };
- if (le16_to_cpu(el->l_next_free_rec) == 0)
+ if (!el->l_next_free_rec || !rec_func)
return 0;
while (name_hash > 0) {
@@ -3254,15 +4088,15 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
&e_cpos, &num_clusters, el);
if (ret) {
mlog_errno(ret);
- goto out;
+ break;
}
- ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
- ocfs2_list_xattr_bucket,
- &xl);
+ ret = rec_func(inode, blk_bh, p_blkno, e_cpos,
+ num_clusters, para);
if (ret) {
- mlog_errno(ret);
- goto out;
+ if (ret != -ERANGE)
+ mlog_errno(ret);
+ break;
}
if (e_cpos == 0)
@@ -3271,6 +4105,37 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
name_hash = e_cpos - 1;
}
+ return ret;
+
+}
+
+static int ocfs2_list_xattr_tree_rec(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno, u32 cpos, u32 len, void *para)
+{
+ return ocfs2_iterate_xattr_buckets(inode, blkno, len,
+ ocfs2_list_xattr_bucket, para);
+}
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+ struct buffer_head *blk_bh,
+ char *buffer,
+ size_t buffer_size)
+{
+ int ret;
+ struct ocfs2_xattr_tree_list xl = {
+ .buffer = buffer,
+ .buffer_size = buffer_size,
+ .result = 0,
+ };
+
+ ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
+ ocfs2_list_xattr_tree_rec, &xl);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = xl.result;
out:
return ret;
@@ -3322,9 +4187,9 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
char *src = xb_bh->b_data;
char *target = bucket_block(bucket, blks - 1);
- mlog(0, "cp xattr from block %llu to bucket %llu\n",
- (unsigned long long)xb_bh->b_blocknr,
- (unsigned long long)bucket_blkno(bucket));
+ trace_ocfs2_cp_xattr_block_to_bucket_begin(
+ (unsigned long long)xb_bh->b_blocknr,
+ (unsigned long long)bucket_blkno(bucket));
for (i = 0; i < blks; i++)
memset(bucket_block(bucket, i), 0, blocksize);
@@ -3360,8 +4225,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
for (i = 0; i < count; i++)
le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
- mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n",
- offset, size, off_change);
+ trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change);
sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
cmp_xe, swap_xe);
@@ -3403,7 +4267,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
u32 bit_off, len;
u64 blkno;
handle_t *handle = ctxt->handle;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct buffer_head *xb_bh = xs->xattr_bh;
struct ocfs2_xattr_block *xb =
@@ -3411,8 +4274,8 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
struct ocfs2_xattr_tree_root *xr;
u16 xb_flags = le16_to_cpu(xb->xb_flags);
- mlog(0, "create xattr index block for %llu\n",
- (unsigned long long)xb_bh->b_blocknr);
+ trace_ocfs2_xattr_create_index_block_begin(
+ (unsigned long long)xb_bh->b_blocknr);
BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
BUG_ON(!xs->bucket);
@@ -3424,14 +4287,14 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
*/
down_write(&oi->ip_alloc_sem);
- ret = ocfs2_journal_access_xb(handle, inode, xb_bh,
+ ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), xb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+ ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
1, 1, &bit_off, &len);
if (ret) {
mlog_errno(ret);
@@ -3445,10 +4308,9 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
*/
blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
- mlog(0, "allocate 1 cluster from %llu to xattr block\n",
- (unsigned long long)blkno);
+ trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
- ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
+ ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3515,7 +4377,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
struct ocfs2_xattr_bucket *bucket)
{
int ret, i;
- size_t end, offset, len, value_len;
+ size_t end, offset, len;
struct ocfs2_xattr_header *xh;
char *entries, *buf, *bucket_buf = NULL;
u64 blkno = bucket_blkno(bucket);
@@ -3550,8 +4412,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
entries = (char *)xh->xh_entries;
xh_free_start = le16_to_cpu(xh->xh_free_start);
- mlog(0, "adjust xattr bucket in %llu, count = %u, "
- "xh_free_start = %u, xh_name_value_len = %u.\n",
+ trace_ocfs2_defrag_xattr_bucket(
(unsigned long long)blkno, le16_to_cpu(xh->xh_count),
xh_free_start, le16_to_cpu(xh->xh_name_value_len));
@@ -3569,12 +4430,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
end = OCFS2_XATTR_BUCKET_SIZE;
for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
offset = le16_to_cpu(xe->xe_name_offset);
- if (ocfs2_xattr_is_local(xe))
- value_len = OCFS2_XATTR_SIZE(
- le64_to_cpu(xe->xe_value_size));
- else
- value_len = OCFS2_XATTR_ROOT_SIZE;
- len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
+ len = namevalue_size_xe(xe);
/*
* We must make sure that the name/value pair
@@ -3658,8 +4514,9 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
- mlog(0, "move half of xattrs in cluster %llu to %llu\n",
- (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
+ trace_ocfs2_mv_xattr_bucket_cross_cluster(
+ (unsigned long long)last_cluster_blkno,
+ (unsigned long long)new_blkno);
ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
last_cluster_blkno, new_blkno,
@@ -3763,14 +4620,14 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
int new_bucket_head)
{
int ret, i;
- int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
+ int count, start, len, name_value_len = 0, name_offset = 0;
struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
struct ocfs2_xattr_header *xh;
struct ocfs2_xattr_entry *xe;
int blocksize = inode->i_sb->s_blocksize;
- mlog(0, "move some of xattrs from bucket %llu to %llu\n",
- (unsigned long long)blk, (unsigned long long)new_blk);
+ trace_ocfs2_divide_xattr_bucket_begin((unsigned long long)blk,
+ (unsigned long long)new_blk);
s_bucket = ocfs2_xattr_bucket_new(inode);
t_bucket = ocfs2_xattr_bucket_new(inode);
@@ -3797,7 +4654,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
* Even if !new_bucket_head, we're overwriting t_bucket. Thus,
* there's no need to read it.
*/
- ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
+ ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3854,13 +4711,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
name_value_len = 0;
for (i = 0; i < start; i++) {
xe = &xh->xh_entries[i];
- xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
- if (ocfs2_xattr_is_local(xe))
- xe_len +=
- OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
- else
- xe_len += OCFS2_XATTR_ROOT_SIZE;
- name_value_len += xe_len;
+ name_value_len += namevalue_size_xe(xe);
if (le16_to_cpu(xe->xe_name_offset) < name_offset)
name_offset = le16_to_cpu(xe->xe_name_offset);
}
@@ -3875,9 +4726,9 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
*/
xe = &xh->xh_entries[start];
len = sizeof(struct ocfs2_xattr_entry) * (count - start);
- mlog(0, "mv xattr entry len %d from %d to %d\n", len,
- (int)((char *)xe - (char *)xh),
- (int)((char *)xh->xh_entries - (char *)xh));
+ trace_ocfs2_divide_xattr_bucket_move(len,
+ (int)((char *)xe - (char *)xh),
+ (int)((char *)xh->xh_entries - (char *)xh));
memmove((char *)xh->xh_entries, (char *)xe, len);
xe = &xh->xh_entries[count - start];
len = sizeof(struct ocfs2_xattr_entry) * start;
@@ -3890,12 +4741,6 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
xe = &xh->xh_entries[i];
- xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
- if (ocfs2_xattr_is_local(xe))
- xe_len +=
- OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
- else
- xe_len += OCFS2_XATTR_ROOT_SIZE;
if (le16_to_cpu(xe->xe_name_offset) <
le16_to_cpu(xh->xh_free_start))
xh->xh_free_start = xe->xe_name_offset;
@@ -3955,9 +4800,9 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
BUG_ON(s_blkno == t_blkno);
- mlog(0, "cp bucket %llu to %llu, target is %d\n",
- (unsigned long long)s_blkno, (unsigned long long)t_blkno,
- t_is_new);
+ trace_ocfs2_cp_xattr_bucket((unsigned long long)s_blkno,
+ (unsigned long long)t_blkno,
+ t_is_new);
s_bucket = ocfs2_xattr_bucket_new(inode);
t_bucket = ocfs2_xattr_bucket_new(inode);
@@ -3975,7 +4820,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
* Even if !t_is_new, we're overwriting t_bucket. Thus,
* there's no need to read it.
*/
- ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
+ ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new);
if (ret)
goto out;
@@ -4029,8 +4874,8 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
struct ocfs2_xattr_bucket *old_first, *new_first;
- mlog(0, "mv xattrs from cluster %llu to %llu\n",
- (unsigned long long)last_blk, (unsigned long long)to_blk);
+ trace_ocfs2_mv_xattr_buckets((unsigned long long)last_blk,
+ (unsigned long long)to_blk);
BUG_ON(start_bucket >= num_buckets);
if (start_bucket) {
@@ -4058,8 +4903,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
* We need to update the first bucket of the old extent and all
* the buckets going to the new extent.
*/
- credits = ((num_buckets + 1) * blks_per_bucket) +
- handle->h_buffer_credits;
+ credits = ((num_buckets + 1) * blks_per_bucket);
ret = ocfs2_extend_trans(handle, credits);
if (ret) {
mlog_errno(ret);
@@ -4129,7 +4973,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
u32 *first_hash)
{
u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
+ int ret, credits = 2 * blk_per_bucket;
BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
@@ -4181,9 +5025,9 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
{
int ret;
- mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
- (unsigned long long)bucket_blkno(first), prev_clusters,
- (unsigned long long)new_blk);
+ trace_ocfs2_adjust_xattr_cross_cluster(
+ (unsigned long long)bucket_blkno(first),
+ (unsigned long long)new_blk, prev_clusters);
if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
@@ -4256,21 +5100,21 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_extent_tree et;
- mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
- "previous xattr blkno = %llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- prev_cpos, (unsigned long long)bucket_blkno(first));
+ trace_ocfs2_add_new_xattr_cluster_begin(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)bucket_blkno(first),
+ prev_cpos, prev_clusters);
- ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
+ ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
- ret = ocfs2_journal_access_xb(handle, inode, root_bh,
+ ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto leave;
}
- ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
+ ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
clusters_to_add, &bit_off, &num_bits);
if (ret < 0) {
if (ret != -ENOSPC)
@@ -4281,8 +5125,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
BUG_ON(num_bits > clusters_to_add);
block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
- mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
- num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_add_new_xattr_cluster((unsigned long long)block, num_bits);
if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
(prev_clusters + num_bits) << osb->s_clustersize_bits <=
@@ -4298,8 +5141,6 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
*/
v_start = prev_cpos + prev_clusters;
*num_clusters = prev_clusters + num_bits;
- mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
- num_bits);
} else {
ret = ocfs2_adjust_xattr_cross_cluster(inode,
handle,
@@ -4315,18 +5156,16 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
}
}
- mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
- num_bits, (unsigned long long)block, v_start);
- ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
+ trace_ocfs2_add_new_xattr_cluster_insert((unsigned long long)block,
+ v_start, num_bits);
+ ret = ocfs2_insert_extent(handle, &et, v_start, block,
num_bits, 0, ctxt->meta_ac);
if (ret < 0) {
mlog_errno(ret);
goto leave;
}
- ret = ocfs2_journal_dirty(handle, root_bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, root_bh);
leave:
return ret;
@@ -4353,9 +5192,9 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
u64 end_blk;
u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
- mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
- "from %llu, len = %u\n", (unsigned long long)target_blk,
- (unsigned long long)bucket_blkno(first), num_clusters);
+ trace_ocfs2_extend_xattr_bucket((unsigned long long)target_blk,
+ (unsigned long long)bucket_blkno(first),
+ num_clusters, new_bucket);
/* The extent must have room for an additional bucket */
BUG_ON(new_bucket >=
@@ -4371,8 +5210,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
* existing bucket. Then we add the last existing bucket, the
* new bucket, and the first bucket (3 * blk_per_bucket).
*/
- credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
- handle->h_buffer_credits;
+ credits = (end_blk - target_blk) + (3 * blk_per_bucket);
ret = ocfs2_extend_trans(handle, credits);
if (ret) {
mlog_errno(ret);
@@ -4436,8 +5274,8 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
/* The bucket at the front of the extent */
struct ocfs2_xattr_bucket *first;
- mlog(0, "Add new xattr bucket starting from %llu\n",
- (unsigned long long)bucket_blkno(target));
+ trace_ocfs2_add_new_xattr_bucket(
+ (unsigned long long)bucket_blkno(target));
/* The first bucket of the original extent */
first = ocfs2_xattr_bucket_new(inode);
@@ -4507,195 +5345,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
}
/*
- * Handle the normal xattr set, including replace, delete and new.
- *
- * Note: "local" indicates the real data's locality. So we can't
- * just its bucket locality by its length.
- */
-static void ocfs2_xattr_set_entry_normal(struct inode *inode,
- struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs,
- u32 name_hash,
- int local)
-{
- struct ocfs2_xattr_entry *last, *xe;
- int name_len = strlen(xi->name);
- struct ocfs2_xattr_header *xh = xs->header;
- u16 count = le16_to_cpu(xh->xh_count), start;
- size_t blocksize = inode->i_sb->s_blocksize;
- char *val;
- size_t offs, size, new_size;
-
- last = &xh->xh_entries[count];
- if (!xs->not_found) {
- xe = xs->here;
- offs = le16_to_cpu(xe->xe_name_offset);
- if (ocfs2_xattr_is_local(xe))
- size = OCFS2_XATTR_SIZE(name_len) +
- OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
- else
- size = OCFS2_XATTR_SIZE(name_len) +
- OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
-
- /*
- * If the new value will be stored outside, xi->value has been
- * initalized as an empty ocfs2_xattr_value_root, and the same
- * goes with xi->value_len, so we can set new_size safely here.
- * See ocfs2_xattr_set_in_bucket.
- */
- new_size = OCFS2_XATTR_SIZE(name_len) +
- OCFS2_XATTR_SIZE(xi->value_len);
-
- le16_add_cpu(&xh->xh_name_value_len, -size);
- if (xi->value) {
- if (new_size > size)
- goto set_new_name_value;
-
- /* Now replace the old value with new one. */
- if (local)
- xe->xe_value_size = cpu_to_le64(xi->value_len);
- else
- xe->xe_value_size = 0;
-
- val = ocfs2_xattr_bucket_get_val(inode,
- xs->bucket, offs);
- memset(val + OCFS2_XATTR_SIZE(name_len), 0,
- size - OCFS2_XATTR_SIZE(name_len));
- if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
- memcpy(val + OCFS2_XATTR_SIZE(name_len),
- xi->value, xi->value_len);
-
- le16_add_cpu(&xh->xh_name_value_len, new_size);
- ocfs2_xattr_set_local(xe, local);
- return;
- } else {
- /*
- * Remove the old entry if there is more than one.
- * We don't remove the last entry so that we can
- * use it to indicate the hash value of the empty
- * bucket.
- */
- last -= 1;
- le16_add_cpu(&xh->xh_count, -1);
- if (xh->xh_count) {
- memmove(xe, xe + 1,
- (void *)last - (void *)xe);
- memset(last, 0,
- sizeof(struct ocfs2_xattr_entry));
- } else
- xh->xh_free_start =
- cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
-
- return;
- }
- } else {
- /* find a new entry for insert. */
- int low = 0, high = count - 1, tmp;
- struct ocfs2_xattr_entry *tmp_xe;
-
- while (low <= high && count) {
- tmp = (low + high) / 2;
- tmp_xe = &xh->xh_entries[tmp];
-
- if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
- low = tmp + 1;
- else if (name_hash <
- le32_to_cpu(tmp_xe->xe_name_hash))
- high = tmp - 1;
- else {
- low = tmp;
- break;
- }
- }
-
- xe = &xh->xh_entries[low];
- if (low != count)
- memmove(xe + 1, xe, (void *)last - (void *)xe);
-
- le16_add_cpu(&xh->xh_count, 1);
- memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
- xe->xe_name_hash = cpu_to_le32(name_hash);
- xe->xe_name_len = name_len;
- ocfs2_xattr_set_type(xe, xi->name_index);
- }
-
-set_new_name_value:
- /* Insert the new name+value. */
- size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
-
- /*
- * We must make sure that the name/value pair
- * exists in the same block.
- */
- offs = le16_to_cpu(xh->xh_free_start);
- start = offs - size;
-
- if (start >> inode->i_sb->s_blocksize_bits !=
- (offs - 1) >> inode->i_sb->s_blocksize_bits) {
- offs = offs - offs % blocksize;
- xh->xh_free_start = cpu_to_le16(offs);
- }
-
- val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
- xe->xe_name_offset = cpu_to_le16(offs - size);
-
- memset(val, 0, size);
- memcpy(val, xi->name, name_len);
- memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
-
- xe->xe_value_size = cpu_to_le64(xi->value_len);
- ocfs2_xattr_set_local(xe, local);
- xs->here = xe;
- le16_add_cpu(&xh->xh_free_start, -size);
- le16_add_cpu(&xh->xh_name_value_len, size);
-
- return;
-}
-
-/*
- * Set the xattr entry in the specified bucket.
- * The bucket is indicated by xs->bucket and it should have the enough
- * space for the xattr insertion.
- */
-static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
- handle_t *handle,
- struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs,
- u32 name_hash,
- int local)
-{
- int ret;
- u64 blkno;
-
- mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
- (unsigned long)xi->value_len, xi->name_index,
- (unsigned long long)bucket_blkno(xs->bucket));
-
- if (!xs->bucket->bu_bhs[1]) {
- blkno = bucket_blkno(xs->bucket);
- ocfs2_xattr_bucket_relse(xs->bucket);
- ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
- ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
-
-out:
- return ret;
-}
-
-/*
* Truncate the specified xe_off entry in xattr bucket.
* bucket is indicated by header_bh and len is the new length.
* Both the ocfs2_xattr_value_root and the entry will be updated here.
@@ -4742,8 +5391,8 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
* modified something. We have to assume they did, and dirty
* the whole bucket. This leaves us in a consistent state.
*/
- mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
- xe_off, (unsigned long long)bucket_blkno(bucket), len);
+ trace_ocfs2_xattr_bucket_value_truncate(
+ (unsigned long long)bucket_blkno(bucket), xe_off, len);
ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
if (ret) {
mlog_errno(ret);
@@ -4765,66 +5414,12 @@ out:
return ret;
}
-static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
- struct ocfs2_xattr_search *xs,
- int len,
- struct ocfs2_xattr_set_ctxt *ctxt)
-{
- int ret, offset;
- struct ocfs2_xattr_entry *xe = xs->here;
- struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
-
- BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
-
- offset = xe - xh->xh_entries;
- ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
- offset, len, ctxt);
- if (ret)
- mlog_errno(ret);
-
- return ret;
-}
-
-static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
- handle_t *handle,
- struct ocfs2_xattr_search *xs,
- char *val,
- int value_len)
-{
- int ret, offset, block_off;
- struct ocfs2_xattr_value_root *xv;
- struct ocfs2_xattr_entry *xe = xs->here;
- struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
- void *base;
-
- BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
-
- ret = ocfs2_xattr_bucket_get_name_value(inode, xh,
- xe - xh->xh_entries,
- &block_off,
- &offset);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- base = bucket_block(xs->bucket, block_off);
- xv = (struct ocfs2_xattr_value_root *)(base + offset +
- OCFS2_XATTR_SIZE(xe->xe_name_len));
-
- ret = __ocfs2_xattr_set_value_outside(inode, handle,
- xv, val, value_len);
- if (ret)
- mlog_errno(ret);
-out:
- return ret;
-}
-
static int ocfs2_rm_xattr_cluster(struct inode *inode,
struct buffer_head *root_bh,
u64 blkno,
u32 cpos,
- u32 len)
+ u32 len,
+ void *para)
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -4836,14 +5431,23 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_extent_tree et;
- ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
+ ret = ocfs2_iterate_xattr_buckets(inode, blkno, len,
+ ocfs2_delete_xattr_in_bucket, para);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
ocfs2_init_dealloc_ctxt(&dealloc);
- mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
- cpos, len, (unsigned long long)blkno);
+ trace_ocfs2_rm_xattr_cluster(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)blkno, cpos, len);
- ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len);
+ ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno,
+ len);
ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
if (ret) {
@@ -4868,14 +5472,14 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
goto out;
}
- ret = ocfs2_journal_access_xb(handle, inode, root_bh,
+ ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
- ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
+ ret = ocfs2_remove_extent(handle, &et, cpos, len, meta_ac,
&dealloc);
if (ret) {
mlog_errno(ret);
@@ -4883,16 +5487,12 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
}
le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
-
- ret = ocfs2_journal_dirty(handle, root_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, root_bh);
ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
if (ret)
mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
out_commit:
ocfs2_commit_trans(osb, handle);
@@ -4909,394 +5509,1689 @@ out:
return ret;
}
-static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
- handle_t *handle,
- struct ocfs2_xattr_search *xs)
+/*
+ * check whether the xattr bucket is filled up with the same hash value.
+ * If we want to insert the xattr with the same hash, return -ENOSPC.
+ * If we want to insert a xattr with different hash value, go ahead
+ * and ocfs2_divide_xattr_bucket will handle this.
+ */
+static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ const char *name)
{
- struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
- struct ocfs2_xattr_entry *last = &xh->xh_entries[
- le16_to_cpu(xh->xh_count) - 1];
- int ret = 0;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+ u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
- ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
+ return 0;
+
+ if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
+ xh->xh_entries[0].xe_name_hash) {
+ mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
+ "hash = %u\n",
+ (unsigned long long)bucket_blkno(bucket),
+ le32_to_cpu(xh->xh_entries[0].xe_name_hash));
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+/*
+ * Try to set the entry in the current bucket. If we fail, the caller
+ * will handle getting us another bucket.
+ */
+static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret;
+ struct ocfs2_xa_loc loc;
+
+ trace_ocfs2_xattr_set_entry_bucket(xi->xi_name);
+
+ ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
+ xs->not_found ? NULL : xs->here);
+ ret = ocfs2_xa_set(&loc, xi, ctxt);
+ if (!ret) {
+ xs->here = loc.xl_entry;
+ goto out;
+ }
+ if (ret != -ENOSPC) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Ok, we need space. Let's try defragmenting the bucket. */
+ ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
+ xs->bucket);
if (ret) {
mlog_errno(ret);
- return;
+ goto out;
}
- /* Remove the old entry. */
- memmove(xs->here, xs->here + 1,
- (void *)last - (void *)xs->here);
- memset(last, 0, sizeof(struct ocfs2_xattr_entry));
- le16_add_cpu(&xh->xh_count, -1);
+ ret = ocfs2_xa_set(&loc, xi, ctxt);
+ if (!ret) {
+ xs->here = loc.xl_entry;
+ goto out;
+ }
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
- ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
+
+out:
+ return ret;
+}
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret;
+
+ trace_ocfs2_xattr_set_entry_index_block(xi->xi_name);
+
+ ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
+ if (!ret)
+ goto out;
+ if (ret != -ENOSPC) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Ack, need more space. Let's try to get another bucket! */
+
+ /*
+ * We do not allow for overlapping ranges between buckets. And
+ * the maximum number of collisions we will allow for then is
+ * one bucket's worth, so check it here whether we need to
+ * add a new bucket for the insert.
+ */
+ ret = ocfs2_check_xattr_bucket_collision(inode,
+ xs->bucket,
+ xi->xi_name);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_add_new_xattr_bucket(inode,
+ xs->xattr_bh,
+ xs->bucket,
+ ctxt);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * ocfs2_add_new_xattr_bucket() will have updated
+ * xs->bucket if it moved, but it will not have updated
+ * any of the other search fields. Thus, we drop it and
+ * re-search. Everything should be cached, so it'll be
+ * quick.
+ */
+ ocfs2_xattr_bucket_relse(xs->bucket);
+ ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
+ xi->xi_name_index,
+ xi->xi_name, xs);
+ if (ret && ret != -ENODATA)
+ goto out;
+ xs->not_found = ret;
+
+ /* Ok, we have a new bucket, let's try again */
+ ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
+ if (ret && (ret != -ENOSPC))
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para)
+{
+ int ret = 0, ref_credits;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+ u16 i;
+ struct ocfs2_xattr_entry *xe;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
+ int credits = ocfs2_remove_extent_credits(osb->sb) +
+ ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ struct ocfs2_xattr_value_root *xv;
+ struct ocfs2_rm_xattr_bucket_para *args =
+ (struct ocfs2_rm_xattr_bucket_para *)para;
+
+ ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
+
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
+ i, &xv, NULL);
+
+ ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
+ args->ref_ci,
+ args->ref_root_bh,
+ &ctxt.meta_ac,
+ &ref_credits);
+
+ ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
+ if (IS_ERR(ctxt.handle)) {
+ ret = PTR_ERR(ctxt.handle);
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
+ i, 0, &ctxt);
+
+ ocfs2_commit_trans(osb, ctxt.handle);
+ if (ctxt.meta_ac) {
+ ocfs2_free_alloc_context(ctxt.meta_ac);
+ ctxt.meta_ac = NULL;
+ }
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ if (ctxt.meta_ac)
+ ocfs2_free_alloc_context(ctxt.meta_ac);
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &ctxt.dealloc);
+ return ret;
}
/*
- * Set the xattr name/value in the bucket specified in xs.
+ * Whenever we modify a xattr value root in the bucket(e.g, CoW
+ * or change the extent record flag), we need to recalculate
+ * the metaecc for the whole bucket. So it is done here.
*
- * As the new value in xi may be stored in the bucket or in an outside cluster,
- * we divide the whole process into 3 steps:
- * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
- * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
- * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
- * 4. If the clusters for the new outside value can't be allocated, we need
- * to free the xattr we allocated in set.
+ * Note:
+ * We have to give the extra credits for the caller.
*/
-static int ocfs2_xattr_set_in_bucket(struct inode *inode,
- struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs,
- struct ocfs2_xattr_set_ctxt *ctxt)
+static int ocfs2_xattr_bucket_post_refcount(struct inode *inode,
+ handle_t *handle,
+ void *para)
{
- int ret, local = 1;
- size_t value_len;
- char *val = (char *)xi->value;
- struct ocfs2_xattr_entry *xe = xs->here;
- u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
- strlen(xi->name));
+ int ret;
+ struct ocfs2_xattr_bucket *bucket =
+ (struct ocfs2_xattr_bucket *)para;
- if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
- /*
- * We need to truncate the xattr storage first.
- *
- * If both the old and new value are stored to
- * outside block, we only need to truncate
- * the storage and then set the value outside.
- *
- * If the new value should be stored within block,
- * we should free all the outside block first and
- * the modification to the xattr block will be done
- * by following steps.
- */
- if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
- value_len = xi->value_len;
- else
- value_len = 0;
+ ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
- ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
- value_len,
- ctxt);
- if (ret)
- goto out;
+ ocfs2_xattr_bucket_journal_dirty(handle, bucket);
- if (value_len)
- goto set_value_outside;
- }
+ return 0;
+}
- value_len = xi->value_len;
- /* So we have to handle the inside block change now. */
- if (value_len > OCFS2_XATTR_INLINE_SIZE) {
- /*
- * If the new value will be stored outside of block,
- * initalize a new empty value root and insert it first.
- */
- local = 0;
- xi->value = &def_xv;
- xi->value_len = OCFS2_XATTR_ROOT_SIZE;
+/*
+ * Special action we need if the xattr value is refcounted.
+ *
+ * 1. If the xattr is refcounted, lock the tree.
+ * 2. CoW the xattr if we are setting the new value and the value
+ * will be stored outside.
+ * 3. In other case, decrease_refcount will work for us, so just
+ * lock the refcount tree, calculate the meta and credits is OK.
+ *
+ * We have to do CoW before ocfs2_init_xattr_set_ctxt since
+ * currently CoW is a completed transaction, while this function
+ * will also lock the allocators and let us deadlock. So we will
+ * CoW the whole xattr value.
+ */
+static int ocfs2_prepare_refcount_xattr(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xis,
+ struct ocfs2_xattr_search *xbs,
+ struct ocfs2_refcount_tree **ref_tree,
+ int *meta_add,
+ int *credits)
+{
+ int ret = 0;
+ struct ocfs2_xattr_block *xb;
+ struct ocfs2_xattr_entry *xe;
+ char *base;
+ u32 p_cluster, num_clusters;
+ unsigned int ext_flags;
+ int name_offset, name_len;
+ struct ocfs2_xattr_value_buf vb;
+ struct ocfs2_xattr_bucket *bucket = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_post_refcount refcount;
+ struct ocfs2_post_refcount *p = NULL;
+ struct buffer_head *ref_root_bh = NULL;
+
+ if (!xis->not_found) {
+ xe = xis->here;
+ name_offset = le16_to_cpu(xe->xe_name_offset);
+ name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ base = xis->base;
+ vb.vb_bh = xis->inode_bh;
+ vb.vb_access = ocfs2_journal_access_di;
+ } else {
+ int i, block_off = 0;
+ xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+ xe = xbs->here;
+ name_offset = le16_to_cpu(xe->xe_name_offset);
+ name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ i = xbs->here - xbs->header->xh_entries;
+
+ if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+ bucket_xh(xbs->bucket),
+ i, &block_off,
+ &name_offset);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ base = bucket_block(xbs->bucket, block_off);
+ vb.vb_bh = xbs->bucket->bu_bhs[block_off];
+ vb.vb_access = ocfs2_journal_access;
+
+ if (ocfs2_meta_ecc(osb)) {
+ /*create parameters for ocfs2_post_refcount. */
+ bucket = xbs->bucket;
+ refcount.credits = bucket->bu_blocks;
+ refcount.para = bucket;
+ refcount.func =
+ ocfs2_xattr_bucket_post_refcount;
+ p = &refcount;
+ }
+ } else {
+ base = xbs->base;
+ vb.vb_bh = xbs->xattr_bh;
+ vb.vb_access = ocfs2_journal_access_xb;
+ }
}
- ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
- name_hash, local);
+ if (ocfs2_xattr_is_local(xe))
+ goto out;
+
+ vb.vb_xv = (struct ocfs2_xattr_value_root *)
+ (base + name_offset + name_len);
+
+ ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
+ &num_clusters, &vb.vb_xv->xr_list,
+ &ext_flags);
if (ret) {
mlog_errno(ret);
goto out;
}
- if (value_len <= OCFS2_XATTR_INLINE_SIZE)
+ /*
+ * We just need to check the 1st extent record, since we always
+ * CoW the whole xattr. So there shouldn't be a xattr with
+ * some REFCOUNT extent recs after the 1st one.
+ */
+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
goto out;
- /* allocate the space now for the outside block storage. */
- ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
- value_len, ctxt);
+ ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+ 1, ref_tree, &ref_root_bh);
if (ret) {
mlog_errno(ret);
+ goto out;
+ }
- if (xs->not_found) {
- /*
- * We can't allocate enough clusters for outside
- * storage and we have allocated xattr already,
- * so need to remove it.
- */
- ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
- }
+ /*
+ * If we are deleting the xattr or the new size will be stored inside,
+ * cool, leave it there, the xattr truncate process will remove them
+ * for us(it still needs the refcount tree lock and the meta, credits).
+ * And the worst case is that every cluster truncate will split the
+ * refcount tree, and make the original extent become 3. So we will need
+ * 2 * cluster more extent recs at most.
+ */
+ if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) {
+
+ ret = ocfs2_refcounted_xattr_delete_need(inode,
+ &(*ref_tree)->rf_ci,
+ ref_root_bh, vb.vb_xv,
+ meta_add, credits);
+ if (ret)
+ mlog_errno(ret);
goto out;
}
-set_value_outside:
- ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
- xs, val, value_len);
+ ret = ocfs2_refcount_cow_xattr(inode, di, &vb,
+ *ref_tree, ref_root_bh, 0,
+ le32_to_cpu(vb.vb_xv->xr_clusters), p);
+ if (ret)
+ mlog_errno(ret);
+
out:
+ brelse(ref_root_bh);
return ret;
}
/*
- * check whether the xattr bucket is filled up with the same hash value.
- * If we want to insert the xattr with the same hash, return -ENOSPC.
- * If we want to insert a xattr with different hash value, go ahead
- * and ocfs2_divide_xattr_bucket will handle this.
+ * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root.
+ * The physical clusters will be added to refcount tree.
*/
-static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
- struct ocfs2_xattr_bucket *bucket,
- const char *name)
+static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
+ struct ocfs2_xattr_value_root *xv,
+ struct ocfs2_extent_tree *value_et,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_post_refcount *refcount)
{
- struct ocfs2_xattr_header *xh = bucket_xh(bucket);
- u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
+ int ret = 0;
+ u32 clusters = le32_to_cpu(xv->xr_clusters);
+ u32 cpos, p_cluster, num_clusters;
+ struct ocfs2_extent_list *el = &xv->xr_list;
+ unsigned int ext_flags;
- if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
- return 0;
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, el, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
- if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
- xh->xh_entries[0].xe_name_hash) {
- mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
- "hash = %u\n",
- (unsigned long long)bucket_blkno(bucket),
- le32_to_cpu(xh->xh_entries[0].xe_name_hash));
- return -ENOSPC;
+ cpos += num_clusters;
+ if ((ext_flags & OCFS2_EXT_REFCOUNTED))
+ continue;
+
+ BUG_ON(!p_cluster);
+
+ ret = ocfs2_add_refcount_flag(inode, value_et,
+ ref_ci, ref_root_bh,
+ cpos - num_clusters,
+ p_cluster, num_clusters,
+ dealloc, refcount);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
}
- return 0;
+ return ret;
}
-static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
- struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs,
- struct ocfs2_xattr_set_ctxt *ctxt)
+/*
+ * Given a normal ocfs2_xattr_header, refcount all the entries which
+ * have value stored outside.
+ * Used for xattrs stored in inode and ocfs2_xattr_block.
+ */
+static int ocfs2_xattr_attach_refcount_normal(struct inode *inode,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_xattr_header *header,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
{
- struct ocfs2_xattr_header *xh;
+
struct ocfs2_xattr_entry *xe;
- u16 count, header_size, xh_free_start;
- int free, max_free, need, old;
- size_t value_size = 0, name_len = strlen(xi->name);
- size_t blocksize = inode->i_sb->s_blocksize;
- int ret, allocation = 0;
+ struct ocfs2_xattr_value_root *xv;
+ struct ocfs2_extent_tree et;
+ int i, ret = 0;
- mlog_entry("Set xattr %s in xattr index block\n", xi->name);
+ for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+ xe = &header->xh_entries[i];
-try_again:
- xh = xs->header;
- count = le16_to_cpu(xh->xh_count);
- xh_free_start = le16_to_cpu(xh->xh_free_start);
- header_size = sizeof(struct ocfs2_xattr_header) +
- count * sizeof(struct ocfs2_xattr_entry);
- max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
- le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
+ if (ocfs2_xattr_is_local(xe))
+ continue;
- mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
- "of %u which exceed block size\n",
- (unsigned long long)bucket_blkno(xs->bucket),
- header_size);
+ xv = (struct ocfs2_xattr_value_root *)((void *)header +
+ le16_to_cpu(xe->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xe->xe_name_len));
- if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
- value_size = OCFS2_XATTR_ROOT_SIZE;
- else if (xi->value)
- value_size = OCFS2_XATTR_SIZE(xi->value_len);
+ vb->vb_xv = xv;
+ ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
- if (xs->not_found)
- need = sizeof(struct ocfs2_xattr_entry) +
- OCFS2_XATTR_SIZE(name_len) + value_size;
- else {
- need = value_size + OCFS2_XATTR_SIZE(name_len);
+ ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et,
+ ref_ci, ref_root_bh,
+ dealloc, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_xattr_inline_attach_refcount(struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+ struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *)
+ (fe_bh->b_data + inode->i_sb->s_blocksize -
+ le16_to_cpu(di->i_xattr_inline_size));
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = fe_bh,
+ .vb_access = ocfs2_journal_access_di,
+ };
+
+ return ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
+ ref_ci, ref_root_bh, dealloc);
+}
+
+struct ocfs2_xattr_tree_value_refcount_para {
+ struct ocfs2_caching_info *ref_ci;
+ struct buffer_head *ref_root_bh;
+ struct ocfs2_cached_dealloc_ctxt *dealloc;
+};
+
+static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
+ struct ocfs2_xattr_bucket *bucket,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **bh)
+{
+ int ret, block_off, name_offset;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+ struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
+ void *base;
+
+ ret = ocfs2_xattr_bucket_get_name_value(sb,
+ bucket_xh(bucket),
+ offset,
+ &block_off,
+ &name_offset);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ base = bucket_block(bucket, block_off);
+
+ *xv = (struct ocfs2_xattr_value_root *)(base + name_offset +
+ OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+ if (bh)
+ *bh = bucket->bu_bhs[block_off];
+out:
+ return ret;
+}
+
+/*
+ * For a given xattr bucket, refcount all the entries which
+ * have value stored outside.
+ */
+static int ocfs2_xattr_bucket_value_refcount(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para)
+{
+ int i, ret = 0;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_xattr_tree_value_refcount_para *ref =
+ (struct ocfs2_xattr_tree_value_refcount_para *)para;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
+ struct ocfs2_xattr_entry *xe;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_access = ocfs2_journal_access,
+ };
+ struct ocfs2_post_refcount refcount = {
+ .credits = bucket->bu_blocks,
+ .para = bucket,
+ .func = ocfs2_xattr_bucket_post_refcount,
+ };
+ struct ocfs2_post_refcount *p = NULL;
+
+ /* We only need post_refcount if we support metaecc. */
+ if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
+ p = &refcount;
+
+ trace_ocfs2_xattr_bucket_value_refcount(
+ (unsigned long long)bucket_blkno(bucket),
+ le16_to_cpu(xh->xh_count));
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i,
+ &vb.vb_xv, &vb.vb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ocfs2_init_xattr_value_extent_tree(&et,
+ INODE_CACHE(inode), &vb);
+
+ ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv,
+ &et, ref->ref_ci,
+ ref->ref_root_bh,
+ ref->dealloc, p);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ return ret;
+
+}
+
+static int ocfs2_refcount_xattr_tree_rec(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno, u32 cpos, u32 len, void *para)
+{
+ return ocfs2_iterate_xattr_buckets(inode, blkno, len,
+ ocfs2_xattr_bucket_value_refcount,
+ para);
+}
+
+static int ocfs2_xattr_block_attach_refcount(struct inode *inode,
+ struct buffer_head *blk_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret = 0;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = blk_bh,
+ .vb_access = ocfs2_journal_access_xb,
+ };
+
+ ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
+ ref_ci, ref_root_bh,
+ dealloc);
+ } else {
+ struct ocfs2_xattr_tree_value_refcount_para para = {
+ .ref_ci = ref_ci,
+ .ref_root_bh = ref_root_bh,
+ .dealloc = dealloc,
+ };
+
+ ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
+ ocfs2_refcount_xattr_tree_rec,
+ &para);
+ }
+
+ return ret;
+}
+
+int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret = 0;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+ struct buffer_head *blk_bh = NULL;
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+ ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh,
+ ref_ci, ref_root_bh,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (!di->i_xattr_loc)
+ goto out;
+
+ ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+ &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci,
+ ref_root_bh, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+ brelse(blk_bh);
+out:
+
+ return ret;
+}
+
+typedef int (should_xattr_reflinked)(struct ocfs2_xattr_entry *xe);
+/*
+ * Store the information we need in xattr reflink.
+ * old_bh and new_bh are inode bh for the old and new inode.
+ */
+struct ocfs2_xattr_reflink {
+ struct inode *old_inode;
+ struct inode *new_inode;
+ struct buffer_head *old_bh;
+ struct buffer_head *new_bh;
+ struct ocfs2_caching_info *ref_ci;
+ struct buffer_head *ref_root_bh;
+ struct ocfs2_cached_dealloc_ctxt *dealloc;
+ should_xattr_reflinked *xattr_reflinked;
+};
+
+/*
+ * Given a xattr header and xe offset,
+ * return the proper xv and the corresponding bh.
+ * xattr in inode, block and xattr tree have different implementations.
+ */
+typedef int (get_xattr_value_root)(struct super_block *sb,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *xh,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **ret_bh,
+ void *para);
+
+/*
+ * Calculate all the xattr value root metadata stored in this xattr header and
+ * credits we need if we create them from the scratch.
+ * We use get_xattr_value_root so that all types of xattr container can use it.
+ */
+static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *xh,
+ int *metas, int *credits,
+ int *num_recs,
+ get_xattr_value_root *func,
+ void *para)
+{
+ int i, ret = 0;
+ struct ocfs2_xattr_value_root *xv;
+ struct ocfs2_xattr_entry *xe;
+
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ ret = func(sb, bh, xh, i, &xv, NULL, para);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ *metas += le16_to_cpu(xv->xr_list.l_tree_depth) *
+ le16_to_cpu(xv->xr_list.l_next_free_rec);
+
+ *credits += ocfs2_calc_extend_credits(sb,
+ &def_xv.xv.xr_list);
/*
- * We only replace the old value if the new length is smaller
- * than the old one. Otherwise we will allocate new space in the
- * bucket to store it.
+ * If the value is a tree with depth > 1, We don't go deep
+ * to the extent block, so just calculate a maximum record num.
*/
- xe = xs->here;
- if (ocfs2_xattr_is_local(xe))
- old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+ if (!xv->xr_list.l_tree_depth)
+ *num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec);
else
- old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+ *num_recs += ocfs2_clusters_for_bytes(sb,
+ XATTR_SIZE_MAX);
+ }
+
+ return ret;
+}
+
+/* Used by xattr inode and block to return the right xv and buffer_head. */
+static int ocfs2_get_xattr_value_root(struct super_block *sb,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *xh,
+ int offset,
+ struct ocfs2_xattr_value_root **xv,
+ struct buffer_head **ret_bh,
+ void *para)
+{
+ struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
+
+ *xv = (struct ocfs2_xattr_value_root *)((void *)xh +
+ le16_to_cpu(xe->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+ if (ret_bh)
+ *ret_bh = bh;
- if (old >= value_size)
- need = 0;
+ return 0;
+}
+
+/*
+ * Lock the meta_ac and calculate how many credits we need for reflink xattrs.
+ * It is only used for inline xattr and xattr block.
+ */
+static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb,
+ struct ocfs2_xattr_header *xh,
+ struct buffer_head *ref_root_bh,
+ int *credits,
+ struct ocfs2_alloc_context **meta_ac)
+{
+ int ret, meta_add = 0, num_recs = 0;
+ struct ocfs2_refcount_block *rb =
+ (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+ *credits = 0;
+
+ ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh,
+ &meta_add, credits, &num_recs,
+ ocfs2_get_xattr_value_root,
+ NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
- free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
/*
- * We need to make sure the new name/value pair
- * can exist in the same block.
+ * We need to add/modify num_recs in refcount tree, so just calculate
+ * an approximate number we need for refcount tree change.
+ * Sometimes we need to split the tree, and after split, half recs
+ * will be moved to the new block, and a new block can only provide
+ * half number of recs. So we multiply new blocks by 2.
*/
- if (xh_free_start % blocksize < need)
- free -= xh_free_start % blocksize;
-
- mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
- "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
- " %u\n", xs->not_found,
- (unsigned long long)bucket_blkno(xs->bucket),
- free, need, max_free, le16_to_cpu(xh->xh_free_start),
- le16_to_cpu(xh->xh_name_value_len));
-
- if (free < need ||
- (xs->not_found &&
- count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
- if (need <= max_free &&
- count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+ num_recs = num_recs / ocfs2_refcount_recs_per_rb(osb->sb) * 2;
+ meta_add += num_recs;
+ *credits += num_recs + num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+ if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+ *credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
+ le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
+ else
+ *credits += 1;
+
+ ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+/*
+ * Given a xattr header, reflink all the xattrs in this container.
+ * It can be used for inode, block and bucket.
+ *
+ * NOTE:
+ * Before we call this function, the caller has memcpy the xattr in
+ * old_xh to the new_xh.
+ *
+ * If args.xattr_reflinked is set, call it to decide whether the xe should
+ * be reflinked or not. If not, remove it from the new xattr header.
+ */
+static int ocfs2_reflink_xattr_header(handle_t *handle,
+ struct ocfs2_xattr_reflink *args,
+ struct buffer_head *old_bh,
+ struct ocfs2_xattr_header *xh,
+ struct buffer_head *new_bh,
+ struct ocfs2_xattr_header *new_xh,
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_alloc_context *meta_ac,
+ get_xattr_value_root *func,
+ void *para)
+{
+ int ret = 0, i, j;
+ struct super_block *sb = args->old_inode->i_sb;
+ struct buffer_head *value_bh;
+ struct ocfs2_xattr_entry *xe, *last;
+ struct ocfs2_xattr_value_root *xv, *new_xv;
+ struct ocfs2_extent_tree data_et;
+ u32 clusters, cpos, p_cluster, num_clusters;
+ unsigned int ext_flags = 0;
+
+ trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr,
+ le16_to_cpu(xh->xh_count));
+
+ last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
+ for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
+ xe = &xh->xh_entries[i];
+
+ if (args->xattr_reflinked && !args->xattr_reflinked(xe)) {
+ xe = &new_xh->xh_entries[j];
+
+ le16_add_cpu(&new_xh->xh_count, -1);
+ if (new_xh->xh_count) {
+ memmove(xe, xe + 1,
+ (void *)last - (void *)xe);
+ memset(last, 0,
+ sizeof(struct ocfs2_xattr_entry));
+ }
+
/*
- * We can create the space by defragment. Since only the
- * name/value will be moved, the xe shouldn't be changed
- * in xs.
+ * We don't want j to increase in the next round since
+ * it is already moved ahead.
*/
- ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
- xs->bucket);
+ j--;
+ continue;
+ }
+
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ ret = func(sb, old_bh, xh, i, &xv, NULL, para);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = func(sb, new_bh, new_xh, j, &new_xv, &value_bh, para);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ /*
+ * For the xattr which has l_tree_depth = 0, all the extent
+ * recs have already been copied to the new xh with the
+ * appropriate OCFS2_EXT_REFCOUNTED flag, so we just need to
+ * increase the refcount in the refcount tree.
+ *
+ * For the xattr which has l_tree_depth > 0, we need
+ * to initialize it to the empty default value root,
+ * and then insert the extents one by one.
+ */
+ if (xv->xr_list.l_tree_depth) {
+ memcpy(new_xv, &def_xv, sizeof(def_xv));
+ vb->vb_xv = new_xv;
+ vb->vb_bh = value_bh;
+ ocfs2_init_xattr_value_extent_tree(&data_et,
+ INODE_CACHE(args->new_inode), vb);
+ }
+
+ clusters = le32_to_cpu(xv->xr_clusters);
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(args->old_inode,
+ cpos,
+ &p_cluster,
+ &num_clusters,
+ &xv->xr_list,
+ &ext_flags);
if (ret) {
mlog_errno(ret);
goto out;
}
- xh_free_start = le16_to_cpu(xh->xh_free_start);
- free = xh_free_start - header_size
- - OCFS2_XATTR_HEADER_GAP;
- if (xh_free_start % blocksize < need)
- free -= xh_free_start % blocksize;
+ BUG_ON(!p_cluster);
- if (free >= need)
- goto xattr_set;
+ if (xv->xr_list.l_tree_depth) {
+ ret = ocfs2_insert_extent(handle,
+ &data_et, cpos,
+ ocfs2_clusters_to_blocks(
+ args->old_inode->i_sb,
+ p_cluster),
+ num_clusters, ext_flags,
+ meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_increase_refcount(handle, args->ref_ci,
+ args->ref_root_bh,
+ p_cluster, num_clusters,
+ meta_ac, args->dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
- mlog(0, "Can't get enough space for xattr insert by "
- "defragment. Need %u bytes, but we have %d, so "
- "allocate new bucket for it.\n", need, free);
+ cpos += num_clusters;
}
+ }
- /*
- * We have to add new buckets or clusters and one
- * allocation should leave us enough space for insert.
- */
- BUG_ON(allocation);
+out:
+ return ret;
+}
- /*
- * We do not allow for overlapping ranges between buckets. And
- * the maximum number of collisions we will allow for then is
- * one bucket's worth, so check it here whether we need to
- * add a new bucket for the insert.
- */
- ret = ocfs2_check_xattr_bucket_collision(inode,
- xs->bucket,
- xi->name);
+static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
+{ /*
+ * Reflink the xattrs stored inline in the old inode's dinode block:
+ * copy the inline xattr area from args->old_bh to args->new_bh inside
+ * one transaction, hand the copied header to
+ * ocfs2_reflink_xattr_header(), then flag the new inode as having
+ * inline xattrs. Returns 0, or the error of the first failing step.
+ */
+ int ret = 0, credits = 0;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data;
+ int inline_size = le16_to_cpu(di->i_xattr_inline_size);
+ int header_off = osb->sb->s_blocksize - inline_size; /* the xattr area is the last inline_size bytes of the block */
+ struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)
+ (args->old_bh->b_data + header_off);
+ struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *)
+ (args->new_bh->b_data + header_off);
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_inode_info *new_oi;
+ struct ocfs2_dinode *new_di;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = args->new_bh,
+ .vb_access = ocfs2_journal_access_di,
+ };
+
+ ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
+ &credits, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode),
+ args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ memcpy(args->new_bh->b_data + header_off,
+ args->old_bh->b_data + header_off, inline_size); /* new_xh now holds a byte copy of xh */
+
+ new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
+ new_di->i_xattr_inline_size = cpu_to_le16(inline_size);
+
+ ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh,
+ args->new_bh, new_xh, &vb, meta_ac,
+ ocfs2_get_xattr_value_root, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ new_oi = OCFS2_I(args->new_inode);
+ /*
+ * Adjust the extent record count to reserve space for the extended
+ * attributes. The inline data count has already been adjusted in
+ * ocfs2_duplicate_inline_data(), so inline-data inodes and fast
+ * symlinks are skipped here.
+ */
+ if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
+ !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
+ struct ocfs2_extent_list *el = &new_di->id2.i_list;
+ le16_add_cpu(&el->l_count, -(inline_size /
+ sizeof(struct ocfs2_extent_rec)));
+ }
+ spin_lock(&new_oi->ip_lock); /* the in-memory flags and their on-disk copy are updated under ip_lock */
+ new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
+ new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
+ spin_unlock(&new_oi->ip_lock);
+
+ ocfs2_journal_dirty(handle, args->new_bh);
+
+out_commit: /* only reached once the transaction has been started */
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+/*
+ * Create an empty xattr block for @inode and hand its buffer head back
+ * through @ret_bh. @fe_bh and @indexed are passed on to
+ * ocfs2_create_xattr_block(). A context for one metadata block is
+ * reserved up front and freed again before returning.
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_create_empty_xattr_block(struct inode *inode,
+					  struct buffer_head *fe_bh,
+					  struct buffer_head **ret_bh,
+					  int indexed)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_set_ctxt set_ctxt;
+	int status;
+
+	memset(&set_ctxt, 0, sizeof(set_ctxt));
+
+	status = ocfs2_reserve_new_metadata_blocks(osb, 1, &set_ctxt.meta_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+
+	set_ctxt.handle = ocfs2_start_trans(osb,
+					    OCFS2_XATTR_BLOCK_CREATE_CREDITS);
+	if (!IS_ERR(set_ctxt.handle)) {
+		trace_ocfs2_create_empty_xattr_block(
+				(unsigned long long)fe_bh->b_blocknr, indexed);
+
+		status = ocfs2_create_xattr_block(inode, fe_bh, &set_ctxt,
+						  indexed, ret_bh);
+		if (status)
+			mlog_errno(status);
+
+		/* The transaction is committed whether or not creation worked. */
+		ocfs2_commit_trans(osb, set_ctxt.handle);
+	} else {
+		status = PTR_ERR(set_ctxt.handle);
+		mlog_errno(status);
+	}
+
+	ocfs2_free_alloc_context(set_ctxt.meta_ac);
+	return status;
+}
+
+/*
+ * Reflink a non-indexed xattr block: copy everything from the xattr
+ * header to the end of blk_bh into new_blk_bh inside one transaction,
+ * let ocfs2_reflink_xattr_header() process the copied entries, and set
+ * OCFS2_HAS_XATTR_FL on the new inode if it is not set yet.
+ * Returns 0, or the error of the first failing step.
+ */
+static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args,
+ struct buffer_head *blk_bh,
+ struct buffer_head *new_blk_bh)
+{
+ int ret = 0, credits = 0;
+ handle_t *handle;
+ struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode);
+ struct ocfs2_dinode *new_di;
+ struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb);
+ int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)blk_bh->b_data;
+ struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header;
+ struct ocfs2_xattr_block *new_xb =
+ (struct ocfs2_xattr_block *)new_blk_bh->b_data;
+ struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header;
+ /* Start from NULL so the exit path can tell whether it was reserved. */
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = new_blk_bh,
+ .vb_access = ocfs2_journal_access_xb,
+ };
+
+ ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
+ &credits, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* One more credit in case we need to add xattr flags in new inode. */
+ handle = ocfs2_start_trans(osb, credits + 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
+ ret = ocfs2_journal_access_di(handle,
+ INODE_CACHE(args->new_inode),
+ args->new_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_commit;
}
+ }
- ret = ocfs2_add_new_xattr_bucket(inode,
- xs->xattr_bh,
- xs->bucket,
- ctxt);
+ ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode),
+ new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off,
+ osb->sb->s_blocksize - header_off);
+
+ ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh,
+ new_blk_bh, new_xh, &vb, meta_ac,
+ ocfs2_get_xattr_value_root, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ocfs2_journal_dirty(handle, new_blk_bh);
+
+ if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
+ new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
+ spin_lock(&new_oi->ip_lock);
+ new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
+ new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
+ spin_unlock(&new_oi->ip_lock);
+
+ ocfs2_journal_dirty(handle, args->new_bh);
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ /* Same guarded release as ocfs2_reflink_xattr_inline(). */
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+struct ocfs2_reflink_xattr_tree_args { /* state shared by the xattr-tree reflink callbacks */
+ struct ocfs2_xattr_reflink *reflink; /* the overall reflink arguments */
+ struct buffer_head *old_blk_bh; /* xattr block being reflinked from */
+ struct buffer_head *new_blk_bh; /* xattr block being reflinked to */
+ struct ocfs2_xattr_bucket *old_bucket; /* bucket allocated for old_inode */
+ struct ocfs2_xattr_bucket *new_bucket; /* bucket allocated for new_inode */
+};
+
+/*
+ * Value-root lookup callback used while reflinking an xattr tree.
+ *
+ * It is called for the old bucket as well as for the new one. The
+ * bucket is chosen by comparing @bh with the first block of the old
+ * bucket, so the caller must pass the bh of the bucket it means.
+ * @xh is not used; @para is the ocfs2_reflink_xattr_tree_args.
+ */
+static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_header *xh,
+					int offset,
+					struct ocfs2_xattr_value_root **xv,
+					struct buffer_head **ret_bh,
+					void *para)
+{
+	struct ocfs2_reflink_xattr_tree_args *tree_args = para;
+	struct ocfs2_xattr_bucket *target;
+
+	target = (bh == tree_args->old_bucket->bu_bhs[0]) ?
+			tree_args->old_bucket : tree_args->new_bucket;
+
+	return ocfs2_get_xattr_tree_value_root(sb, target, offset,
+					       xv, ret_bh);
+}
+
+struct ocfs2_value_tree_metas { /* totals accumulated by ocfs2_calc_value_tree_metas() */
+ int num_metas; /* metadata blocks to reserve */
+ int credits; /* journal credits */
+ int num_recs; /* record count, used to size the refcount tree change */
+};
+
+/*
+ * Value-root lookup callback for ocfs2_calc_value_tree_metas().
+ * @para is the bucket to look in; @bh and @xh are not used.
+ */
+static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_header *xh,
+					int offset,
+					struct ocfs2_xattr_value_root **xv,
+					struct buffer_head **ret_bh,
+					void *para)
+{
+	struct ocfs2_xattr_bucket *bkt = para;
+
+	return ocfs2_get_xattr_tree_value_root(sb, bkt, offset, xv, ret_bh);
+}
+
+/*
+ * Bucket iterator callback: add what reflinking @bucket needs to the
+ * ocfs2_value_tree_metas totals passed in @para.
+ */
+static int ocfs2_calc_value_tree_metas(struct inode *inode,
+				       struct ocfs2_xattr_bucket *bucket,
+				       void *para)
+{
+	struct ocfs2_value_tree_metas *totals = para;
+	struct buffer_head *first_bh = bucket->bu_bhs[0];
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)first_bh->b_data;
+
+	/* The blocks of the bucket itself are counted first. */
+	totals->credits += bucket->bu_blocks;
+
+	return ocfs2_value_metas_in_xattr_header(inode->i_sb, first_bh, xh,
+						 &totals->num_metas,
+						 &totals->credits,
+						 &totals->num_recs,
+						 ocfs2_value_tree_metas_in_bucket,
+						 bucket);
+}
+
+/*
+ * Given an xattr extent rec starting at blkno and spanning len clusters,
+ * iterate over all of its buckets, calculate how much metadata is needed
+ * to reflink every ocfs2_xattr_value_root in them, and reserve the
+ * allocators accordingly.
+ *
+ * On success *credits holds the journal credit estimate, *meta_ac is
+ * reserved when any metadata block is needed and *data_ac when len is
+ * non-zero. On failure *meta_ac is released and reset to NULL; the
+ * error path reads *meta_ac, so the caller must initialize it first.
+ */
+static int ocfs2_lock_reflink_xattr_rec_allocators(
+ struct ocfs2_reflink_xattr_tree_args *args,
+ struct ocfs2_extent_tree *xt_et,
+ u64 blkno, u32 len, int *credits,
+ struct ocfs2_alloc_context **meta_ac,
+ struct ocfs2_alloc_context **data_ac)
+{
+ int ret, num_free_extents;
+ struct ocfs2_value_tree_metas metas;
+ struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb);
+ struct ocfs2_refcount_block *rb;
+
+ memset(&metas, 0, sizeof(metas));
+
+ ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len,
+ ocfs2_calc_value_tree_metas, &metas);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ *credits = metas.credits;
+
+ /*
+ * Calculate what we need for the refcount tree change.
+ *
+ * We need to add/modify num_recs in the refcount tree, so just
+ * calculate an approximate number for that change.
+ * Sometimes we need to split the tree, and after a split half of
+ * the recs are moved to the new block, so a new block can only
+ * provide half its number of recs. So we multiply new blocks by 2.
+ * In the end, we have to add credits for modifying the already
+ * existing refcount block.
+ */
+ rb = (struct ocfs2_refcount_block *)args->reflink->ref_root_bh->b_data;
+ metas.num_recs =
+ (metas.num_recs + ocfs2_refcount_recs_per_rb(osb->sb) - 1) /
+ ocfs2_refcount_recs_per_rb(osb->sb) * 2; /* from here on num_recs counts blocks, not records */
+ metas.num_metas += metas.num_recs;
+ *credits += metas.num_recs +
+ metas.num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+ if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+ *credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
+ le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
+ else
+ *credits += 1;
+
+ /* Count in the xattr tree change. */
+ num_free_extents = ocfs2_num_free_extents(osb, xt_et);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (num_free_extents < len)
+ metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
+
+ *credits += ocfs2_calc_extend_credits(osb->sb,
+ xt_et->et_root_el);
+
+ if (metas.num_metas) {
+ ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
+ meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
-
- /*
- * ocfs2_add_new_xattr_bucket() will have updated
- * xs->bucket if it moved, but it will not have updated
- * any of the other search fields. Thus, we drop it and
- * re-search. Everything should be cached, so it'll be
- * quick.
- */
- ocfs2_xattr_bucket_relse(xs->bucket);
- ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
- xi->name_index,
- xi->name, xs);
- if (ret && ret != -ENODATA)
- goto out;
- xs->not_found = ret;
- allocation = 1;
- goto try_again;
}
-xattr_set:
- ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt);
+ if (len) {
+ ret = ocfs2_reserve_clusters(osb, len, data_ac);
+ if (ret)
+ mlog_errno(ret);
+ }
out:
- mlog_exit(ret);
+ if (ret) { /* undo the metadata reservation on any failure */
+ if (*meta_ac) {
+ ocfs2_free_alloc_context(*meta_ac);
+ *meta_ac = NULL;
+ }
+ }
+
return ret;
}
-static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
- struct ocfs2_xattr_bucket *bucket,
- void *para)
+static int ocfs2_reflink_xattr_bucket(handle_t *handle,
+ u64 blkno, u64 new_blkno, u32 clusters,
+ u32 *cpos, int num_buckets,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_reflink_xattr_tree_args *args) /*
+ * Copy num_buckets buckets starting at blkno into new buckets starting
+ * at new_blkno and run ocfs2_reflink_xattr_header() on every copy.
+ * The name hash of the first entry of the first bucket is returned
+ * through *cpos. Both buckets in args are released before returning.
+ * clusters and data_ac are not used in this function.
+ * Returns 0, or the error of the first failing step.
+ */
{
- int ret = 0;
- struct ocfs2_xattr_header *xh = bucket_xh(bucket);
- u16 i;
- struct ocfs2_xattr_entry *xe;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
- int credits = ocfs2_remove_extent_credits(osb->sb) +
- ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-
+ int i, j, ret = 0;
+ struct super_block *sb = args->reflink->old_inode->i_sb;
+ int bpb = args->old_bucket->bu_blocks;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_access = ocfs2_journal_access,
+ };
- ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
+ for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) {
+ ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
- for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
- xe = &xh->xh_entries[i];
- if (ocfs2_xattr_is_local(xe))
- continue;
+ ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
- ctxt.handle = ocfs2_start_trans(osb, credits);
- if (IS_ERR(ctxt.handle)) {
- ret = PTR_ERR(ctxt.handle);
+ ret = ocfs2_xattr_bucket_journal_access(handle,
+ args->new_bucket,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
mlog_errno(ret);
break;
}
- ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
- i, 0, &ctxt);
+ for (j = 0; j < bpb; j++) /* byte copy of every block of the bucket */
+ memcpy(bucket_block(args->new_bucket, j),
+ bucket_block(args->old_bucket, j),
+ sb->s_blocksize);
- ocfs2_commit_trans(osb, ctxt.handle);
+ /*
+ * Record the start cpos so that we can use it to initialize
+ * our xattr tree; we also set xh_num_buckets for the new
+ * bucket.
+ */
+ if (i == 0) {
+ *cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
+ xh_entries[0].xe_name_hash);
+ bucket_xh(args->new_bucket)->xh_num_buckets =
+ cpu_to_le16(num_buckets);
+ }
+
+ ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
+ ret = ocfs2_reflink_xattr_header(handle, args->reflink,
+ args->old_bucket->bu_bhs[0],
+ bucket_xh(args->old_bucket),
+ args->new_bucket->bu_bhs[0],
+ bucket_xh(args->new_bucket),
+ &vb, meta_ac,
+ ocfs2_get_reflink_xattr_value_root,
+ args);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ /*
+ * Re-access and dirty the bucket so that metaecc is calculated,
+ * because reflink_xattr_header may extend the transaction,
+ * which drops the blocks that were already accessed.
+ */
+ ret = ocfs2_xattr_bucket_journal_access(handle,
+ args->new_bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
break;
}
+
+ ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
+ ocfs2_xattr_bucket_relse(args->old_bucket);
+ ocfs2_xattr_bucket_relse(args->new_bucket);
}
- ocfs2_schedule_truncate_log_flush(osb, 1);
- ocfs2_run_deallocs(osb, &ctxt.dealloc);
+ ocfs2_xattr_bucket_relse(args->old_bucket); /* also covers the break paths above */
+ ocfs2_xattr_bucket_relse(args->new_bucket);
return ret;
}
-static int ocfs2_delete_xattr_index_block(struct inode *inode,
- struct buffer_head *xb_bh)
+/*
+ * Reflink the buckets of one xattr extent record into the new inode.
+ *
+ * The bucket count is read from the first bucket at blkno. Clusters are
+ * then claimed from data_ac, filled by ocfs2_reflink_xattr_bucket() and
+ * inserted into et, until len clusters or all buckets are handled. The
+ * first inserted extent uses cpos; later ones use the cpos reported by
+ * ocfs2_reflink_xattr_bucket().
+ * Returns 0, or the error of the first failing step.
+ */
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+ struct inode *inode,
+ struct ocfs2_reflink_xattr_tree_args *args,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac,
+ u64 blkno, u32 cpos, u32 len)
{
- struct ocfs2_xattr_block *xb =
- (struct ocfs2_xattr_block *)xb_bh->b_data;
- struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
- int ret = 0;
- u32 name_hash = UINT_MAX, e_cpos, num_clusters;
- u64 p_blkno;
-
- if (le16_to_cpu(el->l_next_free_rec) == 0)
- return 0;
+ int ret, first_inserted = 0;
+ u32 p_cluster, num_clusters, reflink_cpos = 0;
+ u64 new_blkno;
+ unsigned int num_buckets, reflink_buckets;
+ unsigned int bpc =
+ ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+
+ ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
+ ocfs2_xattr_bucket_relse(args->old_bucket);
- while (name_hash > 0) {
- ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
- &e_cpos, &num_clusters, el);
+ while (len && num_buckets) {
+ ret = ocfs2_claim_clusters(handle, data_ac,
+ 1, &p_cluster, &num_clusters);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
- ocfs2_delete_xattr_in_bucket,
- NULL);
+ new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+ reflink_buckets = min(num_buckets, bpc * num_clusters);
+
+ ret = ocfs2_reflink_xattr_bucket(handle, blkno,
+ new_blkno, num_clusters,
+ &reflink_cpos, reflink_buckets,
+ meta_ac, data_ac, args);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
- p_blkno, e_cpos, num_clusters);
+ /*
+ * For the 1st allocated cluster, we make it use the same cpos
+ * so that the xattr tree looks the same as the original one
+ * in most cases.
+ */
+ if (!first_inserted) {
+ reflink_cpos = cpos;
+ first_inserted = 1;
+ }
+ ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
+ num_clusters, 0, meta_ac);
+ if (ret) {
+ /*
+ * Stop here: if the loop went on, a later successful
+ * step would overwrite ret and hide this failure
+ * from the caller.
+ */
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_reflink_xattr_buckets((unsigned long long)new_blkno,
+ num_clusters, reflink_cpos);
+
+ len -= num_clusters;
+ blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+ num_buckets -= reflink_buckets;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Create the same xattr extent record in the new inode's xattr tree.
+ *
+ * Callback for ocfs2_iterate_xattr_index_block(); para is the
+ * ocfs2_reflink_xattr_tree_args. The allocators are reserved and one
+ * transaction is started for the record, and the copying is done by
+ * ocfs2_reflink_xattr_buckets(). root_bh is not used here: the extent
+ * tree is set up on args->new_blk_bh.
+ * Returns 0, or the error of the first failing step.
+ */
+static int ocfs2_reflink_xattr_rec(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno,
+ u32 cpos,
+ u32 len,
+ void *para)
+{
+ int ret, credits = 0;
+ handle_t *handle;
+ struct ocfs2_reflink_xattr_tree_args *args =
+ (struct ocfs2_reflink_xattr_tree_args *)para;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_alloc_context *meta_ac = NULL; /* NULL so the exit path frees only what was reserved */
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_extent_tree et;
+
+ trace_ocfs2_reflink_xattr_rec((unsigned long long)blkno, len);
+
+ ocfs2_init_xattr_tree_extent_tree(&et,
+ INODE_CACHE(args->reflink->new_inode),
+ args->new_blk_bh);
+
+ ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno,
+ len, &credits,
+ &meta_ac, &data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
+ meta_ac, data_ac,
+ blkno, cpos, len);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, handle); /* committed even when the copy failed */
+
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ return ret;
+}
+
+/*
+ * Reflink an indexed xattr block.
+ * One bucket is allocated for each inode, then every extent record of
+ * the old xattr tree is handed to ocfs2_reflink_xattr_rec(), which
+ * adds the buckets one by one and refcounts the xattr values that
+ * are stored outside.
+ * Returns 0, -ENOMEM if a bucket cannot be allocated, or the error
+ * from the iteration.
+ */
+static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args,
+				    struct buffer_head *blk_bh,
+				    struct buffer_head *new_blk_bh)
+{
+	int status = -ENOMEM;
+	struct ocfs2_reflink_xattr_tree_args tree_args;
+
+	memset(&tree_args, 0, sizeof(tree_args));
+	tree_args.reflink = args;
+	tree_args.old_blk_bh = blk_bh;
+	tree_args.new_blk_bh = new_blk_bh;
+
+	tree_args.old_bucket = ocfs2_xattr_bucket_new(args->old_inode);
+	if (!tree_args.old_bucket) {
+		mlog_errno(status);
+		return status;
+	}
+
+	tree_args.new_bucket = ocfs2_xattr_bucket_new(args->new_inode);
+	if (tree_args.new_bucket) {
+		status = ocfs2_iterate_xattr_index_block(args->old_inode,
+							 blk_bh,
+							 ocfs2_reflink_xattr_rec,
+							 &tree_args);
+		if (status)
+			mlog_errno(status);
+	} else {
+		mlog_errno(status);
+	}
+
+	ocfs2_xattr_bucket_free(tree_args.old_bucket);
+	ocfs2_xattr_bucket_free(tree_args.new_bucket);
+	return status;
+}
+
+/*
+ * Reflink the xattrs kept in the external xattr block @blk_bh.
+ * An empty xattr block of the same kind (indexed or not) is created
+ * for the new inode and then filled by ocfs2_reflink_xattr_tree() or
+ * ocfs2_reflink_xattr_block(). Returns 0 or the first error.
+ */
+static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
+					struct buffer_head *blk_bh)
+{
+	struct ocfs2_xattr_block *old_xb =
+			(struct ocfs2_xattr_block *)blk_bh->b_data;
+	struct buffer_head *new_blk_bh = NULL;
+	int is_indexed = 0;
+	int status;
+
+	if (le16_to_cpu(old_xb->xb_flags) & OCFS2_XATTR_INDEXED)
+		is_indexed = 1;
+
+	status = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh,
+						&new_blk_bh, is_indexed);
+	if (status) {
+		mlog_errno(status);
+	} else {
+		if (is_indexed)
+			status = ocfs2_reflink_xattr_tree(args, blk_bh,
+							  new_blk_bh);
+		else
+			status = ocfs2_reflink_xattr_block(args, blk_bh,
+							   new_blk_bh);
+		if (status)
+			mlog_errno(status);
+	}
+
+	brelse(new_blk_bh);
+	return status;
+}
+
+/*
+ * Filter used when security is not preserved: returns 0 for security
+ * and POSIX ACL (access/default) entries and 1 for every other type.
+ */
+static int ocfs2_reflink_xattr_no_security(struct ocfs2_xattr_entry *xe)
+{
+	int xe_type = ocfs2_xattr_get_type(xe);
+
+	return !(xe_type == OCFS2_XATTR_INDEX_SECURITY ||
+		 xe_type == OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS ||
+		 xe_type == OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT);
+}
+
+int ocfs2_reflink_xattrs(struct inode *old_inode,
+ struct buffer_head *old_bh,
+ struct inode *new_inode,
+ struct buffer_head *new_bh,
+ bool preserve_security)
+{
+ int ret;
+ struct ocfs2_xattr_reflink args;
+ struct ocfs2_inode_info *oi = OCFS2_I(old_inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct buffer_head *ref_root_bh = NULL;
+
+ ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb),
+ le64_to_cpu(di->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ args.old_inode = old_inode;
+ args.new_inode = new_inode;
+ args.old_bh = old_bh;
+ args.new_bh = new_bh;
+ args.ref_ci = &ref_tree->rf_ci;
+ args.ref_root_bh = ref_root_bh;
+ args.dealloc = &dealloc;
+ if (preserve_security)
+ args.xattr_reflinked = NULL;
+ else
+ args.xattr_reflinked = ocfs2_reflink_xattr_no_security;
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+ ret = ocfs2_reflink_xattr_inline(&args);
if (ret) {
mlog_errno(ret);
- break;
+ goto out_unlock;
}
+ }
- if (e_cpos == 0)
- break;
+ if (!di->i_xattr_loc)
+ goto out_unlock;
- name_hash = e_cpos - 1;
+ ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc),
+ &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_reflink_xattr_in_block(&args, blk_bh);
+ if (ret)
+ mlog_errno(ret);
+
+ brelse(blk_bh);
+
+out_unlock:
+ ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb),
+ ref_tree, 1);
+ brelse(ref_root_bh);
+
+ if (ocfs2_dealloc_has_cluster(&dealloc)) {
+ ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1);
+ ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc);
}
out:
@@ -5304,11 +7199,49 @@ out:
}
/*
+ * Initialize security and acl for an already created inode.
+ * Used when reflinking a file without preserving its security attributes.
+ *
+ * It uses common APIs like ocfs2_xattr_set, so the caller
+ * must not hold any lock except i_mutex.
+ *
+ * Security is initialized first through ocfs2_init_security_get() with
+ * a NULL info argument. Then, with dir locked by ocfs2_inode_lock(),
+ * default_acl and acl are set on inode when they are non-NULL.
+ * Returns 0, or the first error encountered.
+ */
+int ocfs2_init_security_and_acl(struct inode *dir,
+ struct inode *inode,
+ const struct qstr *qstr,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl)
+{
+ struct buffer_head *dir_bh = NULL;
+ int ret = 0;
+
+ ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ ret = ocfs2_inode_lock(dir, &dir_bh, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ if (!ret && default_acl) /* ret is always 0 here; the test mirrors the one below */
+ ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ if (!ret && acl) /* skipped when setting the default ACL failed */
+ ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS);
+
+ ocfs2_inode_unlock(dir, 0);
+ brelse(dir_bh);
+leave:
+ return ret;
+}
+/*
* 'security' attributes support
*/
-static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
+static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
size_t list_size, const char *name,
- size_t name_len)
+ size_t name_len, int type)
{
const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
@@ -5321,34 +7254,56 @@ static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
return total_len;
}
-static int ocfs2_xattr_security_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
+static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name,
- buffer, size);
+ return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
+ name, buffer, size);
}
-static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value,
- size, flags);
+ return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
+ name, value, size, flags);
+}
+
+/*
+ * Store every entry of the NULL-name-terminated @xattr_array on @inode
+ * as a security xattr, using XATTR_CREATE. Stops at the first failure
+ * and returns its error; returns 0 otherwise. @fs_info is not used.
+ */
+int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		     void *fs_info)
+{
+	const struct xattr *cur = xattr_array;
+	int err = 0;
+
+	while (!err && cur->name != NULL) {
+		err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
+				      cur->name, cur->value,
+				      cur->value_len, XATTR_CREATE);
+		cur++;
+	}
+
+	return err;
}
int ocfs2_init_security_get(struct inode *inode,
struct inode *dir,
+ const struct qstr *qstr,
struct ocfs2_security_xattr_info *si)
{
/* check whether ocfs2 support feature xattr */
if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
return -EOPNOTSUPP;
- return security_inode_init_security(inode, dir, &si->name, &si->value,
- &si->value_len);
+ if (si)
+ return security_old_inode_init_security(inode, dir, qstr,
+ &si->name, &si->value,
+ &si->value_len);
+
+ return security_inode_init_security(inode, dir, qstr,
+ &ocfs2_initxattrs, NULL);
}
int ocfs2_init_security_set(handle_t *handle,
@@ -5364,7 +7319,7 @@ int ocfs2_init_security_set(handle_t *handle,
xattr_ac, data_ac);
}
-struct xattr_handler ocfs2_xattr_security_handler = {
+const struct xattr_handler ocfs2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ocfs2_xattr_security_list,
.get = ocfs2_xattr_security_get,
@@ -5374,9 +7329,9 @@ struct xattr_handler ocfs2_xattr_security_handler = {
/*
* 'trusted' attributes support
*/
-static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
+static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
size_t list_size, const char *name,
- size_t name_len)
+ size_t name_len, int type)
{
const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
@@ -5389,26 +7344,26 @@ static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
return total_len;
}
-static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
+static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name,
- buffer, size);
+ return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
+ name, buffer, size);
}
-static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value,
- size, flags);
+ return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
+ name, value, size, flags);
}
-struct xattr_handler ocfs2_xattr_trusted_handler = {
+const struct xattr_handler ocfs2_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.list = ocfs2_xattr_trusted_list,
.get = ocfs2_xattr_trusted_get,
@@ -5418,13 +7373,13 @@ struct xattr_handler ocfs2_xattr_trusted_handler = {
/*
* 'user' attributes support
*/
-static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
+static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
size_t list_size, const char *name,
- size_t name_len)
+ size_t name_len, int type)
{
const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return 0;
@@ -5437,34 +7392,34 @@ static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
return total_len;
}
-static int ocfs2_xattr_user_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
+static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
if (strcmp(name, "") == 0)
return -EINVAL;
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
- return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
+ return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name,
buffer, size);
}
-static int ocfs2_xattr_user_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
if (strcmp(name, "") == 0)
return -EINVAL;
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
- return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value,
- size, flags);
+ return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER,
+ name, value, size, flags);
}
-struct xattr_handler ocfs2_xattr_user_handler = {
+const struct xattr_handler ocfs2_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.list = ocfs2_xattr_user_list,
.get = ocfs2_xattr_user_get,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1ca7e9a1b7b..f10d5b93c36 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -32,19 +32,15 @@ enum ocfs2_xattr_type {
struct ocfs2_security_xattr_info {
int enable;
- char *name;
+ const char *name;
void *value;
size_t value_len;
};
-extern struct xattr_handler ocfs2_xattr_user_handler;
-extern struct xattr_handler ocfs2_xattr_trusted_handler;
-extern struct xattr_handler ocfs2_xattr_security_handler;
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
-extern struct xattr_handler ocfs2_xattr_acl_access_handler;
-extern struct xattr_handler ocfs2_xattr_acl_default_handler;
-#endif
-extern struct xattr_handler *ocfs2_xattr_handlers[];
+extern const struct xattr_handler ocfs2_xattr_user_handler;
+extern const struct xattr_handler ocfs2_xattr_trusted_handler;
+extern const struct xattr_handler ocfs2_xattr_security_handler;
+extern const struct xattr_handler *ocfs2_xattr_handlers[];
ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
@@ -55,8 +51,11 @@ int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
int, const char *, const void *, size_t, int,
struct ocfs2_alloc_context *,
struct ocfs2_alloc_context *);
+int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
+ struct ocfs2_dinode *di);
int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
int ocfs2_init_security_get(struct inode *, struct inode *,
+ const struct qstr *,
struct ocfs2_security_xattr_info *);
int ocfs2_init_security_set(handle_t *, struct inode *,
struct buffer_head *,
@@ -67,7 +66,7 @@ int ocfs2_calc_security_init(struct inode *,
struct ocfs2_security_xattr_info *,
int *, int *, struct ocfs2_alloc_context **);
int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
- int, struct ocfs2_security_xattr_info *,
+ umode_t, struct ocfs2_security_xattr_info *,
int *, int *, int *);
/*
@@ -83,5 +82,19 @@ struct ocfs2_xattr_value_buf {
struct ocfs2_xattr_value_root *vb_xv;
};
-
+int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_caching_info *ref_ci,
+ struct buffer_head *ref_root_bh,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_reflink_xattrs(struct inode *old_inode,
+ struct buffer_head *old_bh,
+ struct inode *new_inode,
+ struct buffer_head *new_bh,
+ bool preserve_security);
+int ocfs2_init_security_and_acl(struct inode *dir,
+ struct inode *inode,
+ const struct qstr *qstr,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl);
#endif /* OCFS2_XATTR_H */