aboutsummaryrefslogtreecommitdiff
path: root/fs/ocfs2
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/Kconfig5
-rw-r--r--fs/ocfs2/Makefile7
-rw-r--r--fs/ocfs2/acl.c324
-rw-r--r--fs/ocfs2/acl.h15
-rw-r--r--fs/ocfs2/alloc.c1413
-rw-r--r--fs/ocfs2/alloc.h17
-rw-r--r--fs/ocfs2/aops.c395
-rw-r--r--fs/ocfs2/aops.h38
-rw-r--r--fs/ocfs2/blockcheck.c50
-rw-r--r--fs/ocfs2/buffer_head_io.c58
-rw-r--r--fs/ocfs2/cluster/Makefile2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c1089
-rw-r--r--fs/ocfs2/cluster/heartbeat.h4
-rw-r--r--fs/ocfs2/cluster/masklog.c20
-rw-r--r--fs/ocfs2/cluster/masklog.h115
-rw-r--r--fs/ocfs2/cluster/netdebug.c364
-rw-r--r--fs/ocfs2/cluster/nodemanager.c10
-rw-r--r--fs/ocfs2/cluster/ocfs2_nodemanager.h6
-rw-r--r--fs/ocfs2/cluster/quorum.c11
-rw-r--r--fs/ocfs2/cluster/sys.c11
-rw-r--r--fs/ocfs2/cluster/tcp.c505
-rw-r--r--fs/ocfs2/cluster/tcp.h2
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h37
-rw-r--r--fs/ocfs2/cluster/ver.c42
-rw-r--r--fs/ocfs2/cluster/ver.h31
-rw-r--r--fs/ocfs2/dcache.c172
-rw-r--r--fs/ocfs2/dcache.h13
-rw-r--r--fs/ocfs2/dir.c407
-rw-r--r--fs/ocfs2/dir.h5
-rw-r--r--fs/ocfs2/dlm/Makefile4
-rw-r--r--fs/ocfs2/dlm/dlmast.c96
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h193
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c29
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c249
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h5
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c644
-rw-r--r--fs/ocfs2/dlm/dlmlock.c79
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c611
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c283
-rw-r--r--fs/ocfs2/dlm/dlmthread.c277
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c30
-rw-r--r--fs/ocfs2/dlm/dlmver.c42
-rw-r--r--fs/ocfs2/dlm/dlmver.h31
-rw-r--r--fs/ocfs2/dlmfs/Makefile4
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c111
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.c42
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.h31
-rw-r--r--fs/ocfs2/dlmglue.c383
-rw-r--r--fs/ocfs2/dlmglue.h4
-rw-r--r--fs/ocfs2/export.c80
-rw-r--r--fs/ocfs2/extent_map.c111
-rw-r--r--fs/ocfs2/extent_map.h2
-rw-r--r--fs/ocfs2/file.c1253
-rw-r--r--fs/ocfs2/file.h6
-rw-r--r--fs/ocfs2/heartbeat.c5
-rw-r--r--fs/ocfs2/inode.c393
-rw-r--r--fs/ocfs2/inode.h39
-rw-r--r--fs/ocfs2/ioctl.c880
-rw-r--r--fs/ocfs2/journal.c350
-rw-r--r--fs/ocfs2/journal.h57
-rw-r--r--fs/ocfs2/localalloc.c441
-rw-r--r--fs/ocfs2/localalloc.h9
-rw-r--r--fs/ocfs2/locks.c5
-rw-r--r--fs/ocfs2/mmap.c130
-rw-r--r--fs/ocfs2/move_extents.c1078
-rw-r--r--fs/ocfs2/move_extents.h22
-rw-r--r--fs/ocfs2/namei.c935
-rw-r--r--fs/ocfs2/ocfs2.h216
-rw-r--r--fs/ocfs2/ocfs2_fs.h231
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h171
-rw-r--r--fs/ocfs2/ocfs2_trace.h2768
-rw-r--r--fs/ocfs2/quota.h17
-rw-r--r--fs/ocfs2/quota_global.c493
-rw-r--r--fs/ocfs2/quota_local.c254
-rw-r--r--fs/ocfs2/refcounttree.c464
-rw-r--r--fs/ocfs2/refcounttree.h22
-rw-r--r--fs/ocfs2/reservations.c839
-rw-r--r--fs/ocfs2/reservations.h159
-rw-r--r--fs/ocfs2/resize.c64
-rw-r--r--fs/ocfs2/slot_map.c22
-rw-r--r--fs/ocfs2/stack_o2cb.c75
-rw-r--r--fs/ocfs2/stack_user.c315
-rw-r--r--fs/ocfs2/stackglue.c50
-rw-r--r--fs/ocfs2/stackglue.h17
-rw-r--r--fs/ocfs2/suballoc.c1284
-rw-r--r--fs/ocfs2/suballoc.h63
-rw-r--r--fs/ocfs2/super.c613
-rw-r--r--fs/ocfs2/super.h21
-rw-r--r--fs/ocfs2/symlink.c121
-rw-r--r--fs/ocfs2/symlink.h2
-rw-r--r--fs/ocfs2/sysfile.c64
-rw-r--r--fs/ocfs2/uptodate.c75
-rw-r--r--fs/ocfs2/ver.c43
-rw-r--r--fs/ocfs2/ver.h31
-rw-r--r--fs/ocfs2/xattr.c637
-rw-r--r--fs/ocfs2/xattr.h20
96 files changed, 15920 insertions, 7343 deletions
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 0d840669698..77a8de5f711 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -1,7 +1,6 @@
config OCFS2_FS
tristate "OCFS2 file system support"
- depends on NET && SYSFS
- select CONFIGFS_FS
+ depends on NET && SYSFS && CONFIGFS_FS
select JBD2
select CRC32
select QUOTA
@@ -51,7 +50,7 @@ config OCFS2_FS_USERSPACE_CLUSTER
config OCFS2_FS_STATS
bool "OCFS2 statistics"
- depends on OCFS2_FS
+ depends on OCFS2_FS && DEBUG_FS
default y
help
This option allows some fs statistics to be captured. Enabling
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 791c0886c06..ce210d4951a 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,6 +1,6 @@
-EXTRA_CFLAGS += -Ifs/ocfs2
+ccflags-y := -Ifs/ocfs2
-EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
+ccflags-y += -DCATCH_BH_JBD_RACES
obj-$(CONFIG_OCFS2_FS) += \
ocfs2.o \
@@ -29,6 +29,8 @@ ocfs2-objs := \
mmap.o \
namei.o \
refcounttree.o \
+ reservations.o \
+ move_extents.o \
resize.o \
slot_map.o \
suballoc.o \
@@ -36,7 +38,6 @@ ocfs2-objs := \
symlink.o \
sysfile.o \
uptodate.o \
- ver.o \
quota_local.o \
quota_global.o \
xattr.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0501974bedd..7e8282dcea2 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,15 +21,17 @@
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/string.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
#include "dlmglue.h"
#include "file.h"
+#include "inode.h"
+#include "journal.h"
#include "ocfs2_fs.h"
#include "xattr.h"
@@ -49,10 +51,6 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
return ERR_PTR(-EINVAL);
count = size / sizeof(struct posix_acl_entry);
- if (count < 0)
- return ERR_PTR(-EINVAL);
- if (count == 0)
- return NULL;
acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
@@ -63,7 +61,20 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
- acl->a_entries[n].e_id = le32_to_cpu(entry->e_id);
+ switch(acl->a_entries[n].e_tag) {
+ case ACL_USER:
+ acl->a_entries[n].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+ case ACL_GROUP:
+ acl->a_entries[n].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ break;
+ default:
+ break;
+ }
value += sizeof(struct posix_acl_entry);
}
@@ -89,7 +100,21 @@ static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
for (n = 0; n < acl->a_count; n++, entry++) {
entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
- entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
+ switch(acl->a_entries[n].e_tag) {
+ case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns,
+ acl->a_entries[n].e_uid));
+ break;
+ case ACL_GROUP:
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns,
+ acl->a_entries[n].e_gid));
+ break;
+ default:
+ entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+ break;
+ }
}
return ocfs2_acl;
}
@@ -135,40 +160,68 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
return acl;
}
-
/*
- * Get posix acl.
+ * Helper function to set i_mode in memory and disk. Some call paths
+ * will not have di_bh or a journal handle to pass, in which case it
+ * will create it's own.
*/
-static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
+static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
+ handle_t *handle, umode_t new_mode)
{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct buffer_head *di_bh = NULL;
- struct posix_acl *acl;
- int ret;
+ int ret, commit_handle = 0;
+ struct ocfs2_dinode *di;
+
+ if (di_bh == NULL) {
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else
+ get_bh(di_bh);
+
+ if (handle == NULL) {
+ handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_brelse;
+ }
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return NULL;
+ commit_handle = 1;
+ }
- ret = ocfs2_inode_lock(inode, &di_bh, 0);
- if (ret < 0) {
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
mlog_errno(ret);
- acl = ERR_PTR(ret);
- return acl;
+ goto out_commit;
}
- acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+ inode->i_mode = new_mode;
+ inode->i_ctime = CURRENT_TIME;
+ di->i_mode = cpu_to_le16(inode->i_mode);
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
- ocfs2_inode_unlock(inode, 0);
+ ocfs2_journal_dirty(handle, di_bh);
+out_commit:
+ if (commit_handle)
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out_brelse:
brelse(di_bh);
-
- return acl;
+out:
+ return ret;
}
/*
* Set the access or default ACL of an inode.
*/
-static int ocfs2_set_acl(handle_t *handle,
+int ocfs2_set_acl(handle_t *handle,
struct inode *inode,
struct buffer_head *di_bh,
int type,
@@ -188,14 +241,19 @@ static int ocfs2_set_acl(handle_t *handle,
case ACL_TYPE_ACCESS:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
ret = posix_acl_equiv_mode(acl, &mode);
if (ret < 0)
return ret;
else {
- inode->i_mode = mode;
if (ret == 0)
acl = NULL;
+
+ ret = ocfs2_acl_set_mode(inode, di_bh,
+ handle, mode);
+ if (ret)
+ return ret;
+
}
}
break;
@@ -226,213 +284,29 @@ static int ocfs2_set_acl(handle_t *handle,
return ret;
}
-int ocfs2_check_acl(struct inode *inode, int mask)
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
-
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- int ret = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- return ret;
- }
-
- return -EAGAIN;
-}
-
-int ocfs2_acl_chmod(struct inode *inode)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct posix_acl *acl, *clone;
- int ret;
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return 0;
-
- acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
- clone = posix_acl_clone(acl, GFP_KERNEL);
- posix_acl_release(acl);
- if (!clone)
- return -ENOMEM;
- ret = posix_acl_chmod_masq(clone, inode->i_mode);
- if (!ret)
- ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
- clone, NULL, NULL);
- posix_acl_release(clone);
- return ret;
+ return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
}
-/*
- * Initialize the ACLs of a new inode. If parent directory has default ACL,
- * then clone to new inode. Called from ocfs2_mknod.
- */
-int ocfs2_init_acl(handle_t *handle,
- struct inode *inode,
- struct inode *dir,
- struct buffer_head *di_bh,
- struct buffer_head *dir_bh,
- struct ocfs2_alloc_context *meta_ac,
- struct ocfs2_alloc_context *data_ac)
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct posix_acl *acl = NULL;
- int ret = 0;
-
- if (!S_ISLNK(inode->i_mode)) {
- if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
- acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
- dir_bh);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- if (!acl)
- inode->i_mode &= ~current_umask();
- }
- if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
- struct posix_acl *clone;
- mode_t mode;
-
- if (S_ISDIR(inode->i_mode)) {
- ret = ocfs2_set_acl(handle, inode, di_bh,
- ACL_TYPE_DEFAULT, acl,
- meta_ac, data_ac);
- if (ret)
- goto cleanup;
- }
- clone = posix_acl_clone(acl, GFP_NOFS);
- ret = -ENOMEM;
- if (!clone)
- goto cleanup;
-
- mode = inode->i_mode;
- ret = posix_acl_create_masq(clone, &mode);
- if (ret >= 0) {
- inode->i_mode = mode;
- if (ret > 0) {
- ret = ocfs2_set_acl(handle, inode,
- di_bh, ACL_TYPE_ACCESS,
- clone, meta_ac, data_ac);
- }
- }
- posix_acl_release(clone);
- }
-cleanup:
- posix_acl_release(acl);
- return ret;
-}
-
-static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry,
- char *list,
- size_t list_len,
- const char *name,
- size_t name_len,
- int type)
-{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return 0;
-
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
- return size;
-}
-
-static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry,
- char *list,
- size_t list_len,
- const char *name,
- size_t name_len,
- int type)
-{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return 0;
-
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
- return size;
-}
-
-static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+ struct ocfs2_super *osb;
+ struct buffer_head *di_bh = NULL;
struct posix_acl *acl;
- int ret;
+ int ret = -EAGAIN;
- if (strcmp(name, "") != 0)
- return -EINVAL;
+ osb = OCFS2_SB(inode->i_sb);
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return -EOPNOTSUPP;
-
- acl = ocfs2_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- ret = posix_acl_to_xattr(acl, buffer, size);
- posix_acl_release(acl);
-
- return ret;
-}
-
-static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- struct inode *inode = dentry->d_inode;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct posix_acl *acl;
- int ret = 0;
+ return NULL;
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return -EOPNOTSUPP;
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret < 0)
+ return ERR_PTR(ret);
- if (!is_owner_or_cap(inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- ret = posix_acl_valid(acl);
- if (ret)
- goto cleanup;
- }
- } else
- acl = NULL;
+ acl = ocfs2_get_acl_nolock(inode, type, di_bh);
- ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+ brelse(di_bh);
-cleanup:
- posix_acl_release(acl);
- return ret;
+ return acl;
}
-
-struct xattr_handler ocfs2_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .list = ocfs2_xattr_list_acl_access,
- .get = ocfs2_xattr_get_acl,
- .set = ocfs2_xattr_set_acl,
-};
-
-struct xattr_handler ocfs2_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .list = ocfs2_xattr_list_acl_default,
- .get = ocfs2_xattr_get_acl,
- .set = ocfs2_xattr_set_acl,
-};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 5c5d31f0585..3fce68d0862 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,11 +26,14 @@ struct ocfs2_acl_entry {
__le32 e_id;
};
-extern int ocfs2_check_acl(struct inode *, int);
-extern int ocfs2_acl_chmod(struct inode *);
-extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
- struct buffer_head *, struct buffer_head *,
- struct ocfs2_alloc_context *,
- struct ocfs2_alloc_context *);
+struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ocfs2_set_acl(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *di_bh,
+ int type,
+ struct posix_acl *acl,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac);
#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9f8bd913c51..9d8fcf2f3b9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,8 +29,8 @@
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -50,6 +50,7 @@
#include "uptodate.h"
#include "xattr.h"
#include "refcounttree.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -565,7 +566,6 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
return ret;
}
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
struct ocfs2_extent_block *eb);
static void ocfs2_adjust_rightmost_records(handle_t *handle,
@@ -887,8 +887,7 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
struct ocfs2_extent_block *eb =
(struct ocfs2_extent_block *)bh->b_data;
- mlog(0, "Validating extent block %llu\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
@@ -966,8 +965,6 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct buffer_head *eb_bh = NULL;
u64 last_eb_blk = 0;
- mlog_entry_void();
-
el = et->et_root_el;
last_eb_blk = ocfs2_et_get_last_eb_blk(et);
@@ -988,7 +985,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
bail:
brelse(eb_bh);
- mlog_exit(retval);
+ trace_ocfs2_num_free_extents(retval);
return retval;
}
@@ -1006,19 +1003,17 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
int count, status, i;
u16 suballoc_bit_start;
u32 num_got;
- u64 first_blkno;
+ u64 suballoc_loc, first_blkno;
struct ocfs2_super *osb =
OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
struct ocfs2_extent_block *eb;
- mlog_entry_void();
-
count = 0;
while (count < wanted) {
- status = ocfs2_claim_metadata(osb,
- handle,
+ status = ocfs2_claim_metadata(handle,
meta_ac,
wanted - count,
+ &suballoc_loc,
&suballoc_bit_start,
&num_got,
&first_blkno);
@@ -1030,7 +1025,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
for(i = count; i < (num_got + count); i++) {
bhs[i] = sb_getblk(osb->sb, first_blkno);
if (bhs[i] == NULL) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -1052,6 +1047,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
eb->h_suballoc_slot =
cpu_to_le16(meta_ac->ac_alloc_slot);
+ eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
eb->h_list.l_count =
cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1061,11 +1057,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
/* We'll also be dirtied by the caller, so
* this isn't absolutely necessary. */
- status = ocfs2_journal_dirty(handle, bhs[i]);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, bhs[i]);
}
count += num_got;
@@ -1078,8 +1070,8 @@ bail:
brelse(bhs[i]);
bhs[i] = NULL;
}
+ mlog_errno(status);
}
- mlog_exit(status);
return status;
}
@@ -1129,8 +1121,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
goto out;
}
- status = ocfs2_extend_trans(handle, path_num_items(path) +
- handle->h_buffer_credits);
+ status = ocfs2_extend_trans(handle, path_num_items(path));
if (status < 0) {
mlog_errno(status);
goto out;
@@ -1143,7 +1134,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
}
el = path_leaf_el(path);
- rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
+ rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1];
ocfs2_adjust_rightmost_records(handle, et, path, rec);
@@ -1178,8 +1169,6 @@ static int ocfs2_add_branch(handle_t *handle,
struct ocfs2_extent_list *el;
u32 new_cpos, root_end;
- mlog_entry_void();
-
BUG_ON(!last_eb_bh || !*last_eb_bh);
if (eb_bh) {
@@ -1205,8 +1194,11 @@ static int ocfs2_add_branch(handle_t *handle,
* from new_cpos).
*/
if (root_end > new_cpos) {
- mlog(0, "adjust the cluster end from %u to %u\n",
- root_end, new_cpos);
+ trace_ocfs2_adjust_rightmost_branch(
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci),
+ root_end, new_cpos);
+
status = ocfs2_adjust_rightmost_branch(handle, et);
if (status) {
mlog_errno(status);
@@ -1270,12 +1262,7 @@ static int ocfs2_add_branch(handle_t *handle,
if (!eb_el->l_tree_depth)
new_last_eb_blk = le64_to_cpu(eb->h_blkno);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
+ ocfs2_journal_dirty(handle, bh);
next_blkno = le64_to_cpu(eb->h_blkno);
}
@@ -1321,17 +1308,10 @@ static int ocfs2_add_branch(handle_t *handle,
eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
- status = ocfs2_journal_dirty(handle, *last_eb_bh);
- if (status < 0)
- mlog_errno(status);
- status = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (status < 0)
- mlog_errno(status);
- if (eb_bh) {
- status = ocfs2_journal_dirty(handle, eb_bh);
- if (status < 0)
- mlog_errno(status);
- }
+ ocfs2_journal_dirty(handle, *last_eb_bh);
+ ocfs2_journal_dirty(handle, et->et_root_bh);
+ if (eb_bh)
+ ocfs2_journal_dirty(handle, eb_bh);
/*
* Some callers want to track the rightmost leaf so pass it
@@ -1349,7 +1329,6 @@ bail:
kfree(new_eb_bhs);
}
- mlog_exit(status);
return status;
}
@@ -1370,8 +1349,6 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
struct ocfs2_extent_list *root_el;
struct ocfs2_extent_list *eb_el;
- mlog_entry_void();
-
status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
&new_eb_bh);
if (status < 0) {
@@ -1399,11 +1376,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
eb_el->l_recs[i] = root_el->l_recs[i];
- status = ocfs2_journal_dirty(handle, new_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, new_eb_bh);
status = ocfs2_et_root_journal_access(handle, et,
OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1428,11 +1401,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
if (root_el->l_tree_depth == cpu_to_le16(1))
ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
- status = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, et->et_root_bh);
*ret_new_eb_bh = new_eb_bh;
new_eb_bh = NULL;
@@ -1440,7 +1409,6 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
bail:
brelse(new_eb_bh);
- mlog_exit(status);
return status;
}
@@ -1471,8 +1439,6 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
struct buffer_head *bh = NULL;
struct buffer_head *lowest_bh = NULL;
- mlog_entry_void();
-
*target_bh = NULL;
el = et->et_root_el;
@@ -1528,7 +1494,6 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
bail:
brelse(bh);
- mlog_exit(status);
return status;
}
@@ -1565,7 +1530,10 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
* another tree level */
if (shift) {
BUG_ON(bh);
- mlog(0, "need to shift tree depth (current = %d)\n", depth);
+ trace_ocfs2_grow_tree(
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci),
+ depth);
/* ocfs2_shift_tree_depth will return us a buffer with
* the new extent block (so we can pass that to
@@ -1595,7 +1563,6 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
/* call ocfs2_add_branch to add the final part of the tree with
* the new data. */
- mlog(0, "add branch. bh = %p\n", bh);
ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
meta_ac);
if (ret < 0) {
@@ -1670,8 +1637,9 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
}
insert_index = i;
- mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
- insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
+ trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
+ has_empty, next_free,
+ le16_to_cpu(el->l_count));
BUG_ON(insert_index < 0);
BUG_ON(insert_index >= le16_to_cpu(el->l_count));
@@ -2064,7 +2032,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
struct ocfs2_path *right_path,
int subtree_index)
{
- int ret, i, idx;
+ int i, idx;
struct ocfs2_extent_list *el, *left_el, *right_el;
struct ocfs2_extent_rec *left_rec, *right_rec;
struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
@@ -2084,7 +2052,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
left_el = path_leaf_el(left_path);
right_el = path_leaf_el(right_path);
for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
- mlog(0, "Adjust records at index %u\n", i);
+ trace_ocfs2_complete_edge_insert(i);
/*
* One nice property of knowing that all of these
@@ -2102,13 +2070,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
right_el);
- ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
- if (ret)
- mlog_errno(ret);
-
- ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
+ ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
/*
* Setup our list pointers now so that the current
@@ -2132,9 +2095,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
root_bh = left_path->p_node[subtree_index].bh;
- ret = ocfs2_journal_dirty(handle, root_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, root_bh);
}
static int ocfs2_rotate_subtree_right(handle_t *handle,
@@ -2207,11 +2168,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
ocfs2_create_empty_extent(right_el);
- ret = ocfs2_journal_dirty(handle, right_leaf_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ ocfs2_journal_dirty(handle, right_leaf_bh);
/* Do the copy now. */
i = le16_to_cpu(left_el->l_next_free_rec) - 1;
@@ -2230,11 +2187,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
le16_add_cpu(&left_el->l_next_free_rec, 1);
- ret = ocfs2_journal_dirty(handle, left_leaf_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ ocfs2_journal_dirty(handle, left_leaf_bh);
ocfs2_complete_edge_insert(handle, left_path, right_path,
subtree_index);
@@ -2249,8 +2202,8 @@ out:
*
* Will return zero if the path passed in is already the leftmost path.
*/
-static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
- struct ocfs2_path *path, u32 *cpos)
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos)
{
int i, j, ret = 0;
u64 blkno;
@@ -2327,20 +2280,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
int op_credits,
struct ocfs2_path *path)
{
- int ret;
+ int ret = 0;
int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
- if (handle->h_buffer_credits < credits) {
+ if (handle->h_buffer_credits < credits)
ret = ocfs2_extend_trans(handle,
credits - handle->h_buffer_credits);
- if (ret)
- return ret;
-
- if (unlikely(handle->h_buffer_credits < credits))
- return ocfs2_extend_trans(handle, credits);
- }
- return 0;
+ return ret;
}
/*
@@ -2435,7 +2382,9 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
goto out;
}
- mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
+ trace_ocfs2_rotate_tree_right(
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ insert_cpos, cpos);
/*
* What we want to do here is:
@@ -2464,8 +2413,10 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
* rotating subtrees.
*/
while (cpos && insert_cpos <= cpos) {
- mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
- insert_cpos, cpos);
+ trace_ocfs2_rotate_tree_right(
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci),
+ insert_cpos, cpos);
ret = ocfs2_find_path(et->et_ci, left_path, cpos);
if (ret) {
@@ -2507,10 +2458,10 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
start = ocfs2_find_subtree_root(et, left_path, right_path);
- mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
- start,
- (unsigned long long) right_path->p_node[start].bh->b_blocknr,
- right_path->p_tree_depth);
+ trace_ocfs2_rotate_subtree(start,
+ (unsigned long long)
+ right_path->p_node[start].bh->b_blocknr,
+ right_path->p_tree_depth);
ret = ocfs2_extend_rotate_transaction(handle, start,
orig_credits, right_path);
@@ -2584,8 +2535,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
* records for all the bh in the path.
* So we have to allocate extra credits and access them.
*/
- ret = ocfs2_extend_trans(handle,
- handle->h_buffer_credits + subtree_index);
+ ret = ocfs2_extend_trans(handle, subtree_index);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2823,12 +2773,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
ocfs2_remove_empty_extent(right_leaf_el);
}
- ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
- if (ret)
- mlog_errno(ret);
- ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+ ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
if (del_right_subtree) {
ocfs2_unlink_subtree(handle, et, left_path, right_path,
@@ -2851,9 +2797,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
if (right_has_empty)
ocfs2_remove_empty_extent(left_leaf_el);
- ret = ocfs2_journal_dirty(handle, et_root_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, et_root_bh);
*deleted = 1;
} else
@@ -2962,10 +2906,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
}
ocfs2_remove_empty_extent(el);
-
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, bh);
out:
return ret;
@@ -3020,8 +2961,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
subtree_root = ocfs2_find_subtree_root(et, left_path,
right_path);
- mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
- subtree_root,
+ trace_ocfs2_rotate_subtree(subtree_root,
(unsigned long long)
right_path->p_node[subtree_root].bh->b_blocknr,
right_path->p_tree_depth);
@@ -3506,15 +3446,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
ocfs2_cleanup_merge(el, index);
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret)
- mlog_errno(ret);
-
+ ocfs2_journal_dirty(handle, bh);
if (right_path) {
- ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
- if (ret)
- mlog_errno(ret);
-
+ ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
ocfs2_complete_edge_insert(handle, left_path, right_path,
subtree_index);
}
@@ -3683,14 +3617,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
ocfs2_cleanup_merge(el, index);
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret)
- mlog_errno(ret);
-
+ ocfs2_journal_dirty(handle, bh);
if (left_path) {
- ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
/*
* In the situation that the right_rec is empty and the extent
@@ -4016,10 +3945,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
le32_add_cpu(&rec->e_int_clusters,
-le32_to_cpu(rec->e_cpos));
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret)
- mlog_errno(ret);
-
+ ocfs2_journal_dirty(handle, bh);
}
}
@@ -4059,9 +3985,11 @@ static int ocfs2_append_rec_to_path(handle_t *handle,
goto out;
}
- mlog(0, "Append may need a left path update. cpos: %u, "
- "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
- left_cpos);
+ trace_ocfs2_append_rec_to_path(
+ (unsigned long long)
+ ocfs2_metadata_cache_owner(et->et_ci),
+ le32_to_cpu(insert_rec->e_cpos),
+ left_cpos);
/*
* No need to worry if the append is already in the
@@ -4203,17 +4131,13 @@ static int ocfs2_insert_path(handle_t *handle,
struct buffer_head *leaf_bh = path_leaf_bh(right_path);
if (left_path) {
- int credits = handle->h_buffer_credits;
-
/*
* There's a chance that left_path got passed back to
* us without being accounted for in the
* journal. Extend our transaction here to be sure we
* can change those blocks.
*/
- credits += left_path->p_tree_depth;
-
- ret = ocfs2_extend_trans(handle, credits);
+ ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -4251,17 +4175,13 @@ static int ocfs2_insert_path(handle_t *handle,
* dirty this for us.
*/
if (left_path)
- ret = ocfs2_journal_dirty(handle,
- path_leaf_bh(left_path));
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle,
+ path_leaf_bh(left_path));
} else
ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
insert);
- ret = ocfs2_journal_dirty(handle, leaf_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, leaf_bh);
if (left_path) {
/*
@@ -4384,9 +4304,7 @@ out_update_clusters:
ocfs2_et_update_clusters(et,
le16_to_cpu(insert_rec->e_leaf_clusters));
- ret = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, et->et_root_bh);
out:
ocfs2_free_path(left_path);
@@ -4602,7 +4520,7 @@ set_tail_append:
}
/*
- * Helper function called at the begining of an insert.
+ * Helper function called at the beginning of an insert.
*
* This computes a few things that are commonly used in the process of
* inserting into the btree:
@@ -4642,7 +4560,7 @@ static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
ocfs2_et_get_last_eb_blk(et),
&bh);
if (ret) {
- mlog_exit(ret);
+ mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) bh->b_data;
@@ -4758,9 +4676,9 @@ int ocfs2_insert_extent(handle_t *handle,
struct ocfs2_insert_type insert = {0, };
struct ocfs2_extent_rec rec;
- mlog(0, "add %u clusters at position %u to owner %llu\n",
- new_clusters, cpos,
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
+ trace_ocfs2_insert_extent_start(
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos, new_clusters);
memset(&rec, 0, sizeof(rec));
rec.e_cpos = cpu_to_le32(cpos);
@@ -4780,11 +4698,9 @@ int ocfs2_insert_extent(handle_t *handle,
goto bail;
}
- mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
- "Insert.contig_index: %d, Insert.free_records: %d, "
- "Insert.tree_depth: %d\n",
- insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
- free_records, insert.ins_tree_depth);
+ trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
+ insert.ins_contig_index, free_records,
+ insert.ins_tree_depth);
if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
status = ocfs2_grow_tree(handle, et,
@@ -4806,7 +4722,6 @@ int ocfs2_insert_extent(handle_t *handle,
bail:
brelse(last_eb_bh);
- mlog_exit(status);
return status;
}
@@ -4826,7 +4741,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret)
{
- int status = 0;
+ int status = 0, err = 0;
+ int need_free = 0;
int free_extents;
enum ocfs2_alloc_restarted reason = RESTART_NONE;
u32 bit_off, num_bits;
@@ -4853,20 +4769,20 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
* 2) we are so fragmented, we've needed to add metadata too
* many times. */
if (!free_extents && !meta_ac) {
- mlog(0, "we haven't reserved any metadata!\n");
+ err = -1;
status = -EAGAIN;
reason = RESTART_META;
goto leave;
} else if ((!free_extents)
&& (ocfs2_alloc_context_bits_left(meta_ac)
< ocfs2_extend_meta_needed(et->et_root_el))) {
- mlog(0, "filesystem is really fragmented...\n");
+ err = -2;
status = -EAGAIN;
reason = RESTART_META;
goto leave;
}
- status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+ status = __ocfs2_claim_clusters(handle, data_ac, 1,
clusters_to_add, &bit_off, &num_bits);
if (status < 0) {
if (status != -ENOSPC)
@@ -4881,40 +4797,50 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ need_free = 1;
+ goto bail;
}
block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
- mlog(0, "Allocating %u clusters at block %u for owner %llu\n",
- num_bits, bit_off,
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
+ trace_ocfs2_add_clusters_in_btree(
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ bit_off, num_bits);
status = ocfs2_insert_extent(handle, et, *logical_offset, block,
num_bits, flags, meta_ac);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ need_free = 1;
+ goto bail;
}
- status = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, et->et_root_bh);
clusters_to_add -= num_bits;
*logical_offset += num_bits;
if (clusters_to_add) {
- mlog(0, "need to alloc once more, wanted = %u\n",
- clusters_to_add);
+ err = clusters_to_add;
status = -EAGAIN;
reason = RESTART_TRANS;
}
+bail:
+ if (need_free) {
+ if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ bit_off, num_bits);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, bit_off),
+ num_bits);
+ }
+
leave:
- mlog_exit(status);
if (reason_ret)
*reason_ret = reason;
+ trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
return status;
}
@@ -5123,7 +5049,7 @@ int ocfs2_split_extent(handle_t *handle,
ocfs2_et_get_last_eb_blk(et),
&last_eb_bh);
if (ret) {
- mlog_exit(ret);
+ mlog_errno(ret);
goto out;
}
@@ -5140,9 +5066,9 @@ int ocfs2_split_extent(handle_t *handle,
ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
- mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
- split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
- ctxt.c_split_covers_rec);
+ trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
+ ctxt.c_has_empty_extent,
+ ctxt.c_split_covers_rec);
if (ctxt.c_contig_type == CONTIG_NONE) {
if (ctxt.c_split_covers_rec)
@@ -5276,8 +5202,9 @@ int ocfs2_mark_extent_written(struct inode *inode,
{
int ret;
- mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n",
- inode->i_ino, cpos, len, phys);
+ trace_ocfs2_mark_extent_written(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ cpos, len, phys);
if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
@@ -5309,7 +5236,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
int index, u32 new_range,
struct ocfs2_alloc_context *meta_ac)
{
- int ret, depth, credits = handle->h_buffer_credits;
+ int ret, depth, credits;
struct buffer_head *last_eb_bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *rightmost_el, *el;
@@ -5340,8 +5267,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
} else
rightmost_el = path_leaf_el(path);
- credits += path->p_tree_depth +
- ocfs2_extend_meta_needed(et->et_root_el);
+ credits = path->p_tree_depth +
+ ocfs2_extend_meta_needed(et->et_root_el);
ret = ocfs2_extend_trans(handle, credits);
if (ret) {
mlog_errno(ret);
@@ -5596,11 +5523,10 @@ int ocfs2_remove_extent(handle_t *handle,
BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
- mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d "
- "(cpos %u, len %u)\n",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
- cpos, len, index,
- le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
+ trace_ocfs2_remove_extent(
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos, len, index, le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
@@ -5671,22 +5597,100 @@ out:
return ret;
}
+/*
+ * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
+ * same as ocfs2_lock_alloctors(), except for it accepts a blocks
+ * number to reserve some extra blocks, and it only handles meta
+ * data allocations.
+ *
+ * Currently, only ocfs2_remove_btree_range() uses it for truncating
+ * and punching holes.
+ */
+static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 extents_to_split,
+ struct ocfs2_alloc_context **ac,
+ int extra_blocks)
+{
+ int ret = 0, num_free_extents;
+ unsigned int max_recs_needed = 2 * extents_to_split;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ *ac = NULL;
+
+ num_free_extents = ocfs2_num_free_extents(osb, et);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!num_free_extents ||
+ (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
+ extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+ if (extra_blocks) {
+ ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ if (ret) {
+ if (*ac) {
+ ocfs2_free_alloc_context(*ac);
+ *ac = NULL;
+ }
+ }
+
+ return ret;
+}
+
int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_extent_tree *et,
- u32 cpos, u32 phys_cpos, u32 len,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+ u32 cpos, u32 phys_cpos, u32 len, int flags,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ u64 refcount_loc)
{
- int ret;
+ int ret, credits = 0, extra_blocks = 0;
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *tl_inode = osb->osb_tl_inode;
handle_t *handle;
struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+
+ if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+ OCFS2_HAS_REFCOUNT_FL));
+
+ ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+ &ref_tree, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto bail;
+ }
- ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
+ ret = ocfs2_prepare_refcount_change_for_del(inode,
+ refcount_loc,
+ phys_blkno,
+ len,
+ &credits,
+ &extra_blocks);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto bail;
+ }
+ }
+
+ ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
+ extra_blocks);
if (ret) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
mutex_lock(&tl_inode->i_mutex);
@@ -5699,7 +5703,8 @@ int ocfs2_remove_btree_range(struct inode *inode,
}
}
- handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+ handle = ocfs2_start_trans(osb,
+ ocfs2_remove_extent_credits(osb->sb) + credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
@@ -5710,7 +5715,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_commit;
}
dquot_free_space_nodirty(inode,
@@ -5723,25 +5728,36 @@ int ocfs2_remove_btree_range(struct inode *inode,
}
ocfs2_et_update_clusters(et, -len);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
- ret = ocfs2_journal_dirty(handle, et->et_root_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, et->et_root_bh);
- ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
- if (ret)
- mlog_errno(ret);
+ if (phys_blkno) {
+ if (flags & OCFS2_EXT_REFCOUNTED)
+ ret = ocfs2_decrease_refcount(inode, handle,
+ ocfs2_blocks_to_clusters(osb->sb,
+ phys_blkno),
+ len, meta_ac,
+ dealloc, 1);
+ else
+ ret = ocfs2_truncate_log_append(osb, handle,
+ phys_blkno, len);
+ if (ret)
+ mlog_errno(ret);
+
+ }
out_commit:
ocfs2_commit_trans(osb, handle);
out:
mutex_unlock(&tl_inode->i_mutex);
-
+bail:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
return ret;
}
@@ -5790,9 +5806,6 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
- mlog_entry("start_blk = %llu, num_clusters = %u\n",
- (unsigned long long)start_blk, num_clusters);
-
BUG_ON(mutex_trylock(&tl_inode->i_mutex));
start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
@@ -5829,10 +5842,9 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
goto bail;
}
- mlog(0, "Log truncate of %u clusters starting at cluster %u to "
- "%llu (index = %d)\n", num_clusters, start_cluster,
- (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
-
+ trace_ocfs2_truncate_log_append(
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
+ start_cluster, num_clusters);
if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
/*
* Move index back to the record we are coalescing with.
@@ -5841,23 +5853,20 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
index--;
num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
- mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
- index, le32_to_cpu(tl->tl_recs[index].t_start),
- num_clusters);
+ trace_ocfs2_truncate_log_append(
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+ index, le32_to_cpu(tl->tl_recs[index].t_start),
+ num_clusters);
} else {
tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
tl->tl_used = cpu_to_le16(index + 1);
}
tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
- status = ocfs2_journal_dirty(handle, tl_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, tl_bh);
+ osb->truncated_clusters += num_clusters;
bail:
- mlog_exit(status);
return status;
}
@@ -5876,8 +5885,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
struct inode *tl_inode = osb->osb_tl_inode;
struct buffer_head *tl_bh = osb->osb_tl_bh;
- mlog_entry_void();
-
di = (struct ocfs2_dinode *) tl_bh->b_data;
tl = &di->id2.i_dealloc;
i = le16_to_cpu(tl->tl_used) - 1;
@@ -5893,11 +5900,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
tl->tl_used = cpu_to_le16(i);
- status = ocfs2_journal_dirty(handle, tl_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, tl_bh);
/* TODO: Perhaps we can calculate the bulk of the
* credits up front rather than extending like
@@ -5917,8 +5920,9 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
/* if start_blk is not set, we ignore the record as
* invalid. */
if (start_blk) {
- mlog(0, "free record %d, start = %u, clusters = %u\n",
- i, le32_to_cpu(rec.t_start), num_clusters);
+ trace_ocfs2_replay_truncate_records(
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+ i, le32_to_cpu(rec.t_start), num_clusters);
status = ocfs2_free_clusters(handle, data_alloc_inode,
data_alloc_bh, start_blk,
@@ -5931,8 +5935,9 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
i--;
}
+ osb->truncated_clusters = 0;
+
bail:
- mlog_exit(status);
return status;
}
@@ -5949,8 +5954,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
- mlog_entry_void();
-
BUG_ON(mutex_trylock(&tl_inode->i_mutex));
di = (struct ocfs2_dinode *) tl_bh->b_data;
@@ -5962,8 +5965,9 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
tl = &di->id2.i_dealloc;
num_to_flush = le16_to_cpu(tl->tl_used);
- mlog(0, "Flush %u records from truncate log #%llu\n",
- num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
+ trace_ocfs2_flush_truncate_log(
+ (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+ num_to_flush);
if (!num_to_flush) {
status = 0;
goto out;
@@ -6009,7 +6013,6 @@ out_mutex:
iput(data_alloc_inode);
out:
- mlog_exit(status);
return status;
}
@@ -6032,22 +6035,19 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
container_of(work, struct ocfs2_super,
osb_truncate_log_wq.work);
- mlog_entry_void();
-
status = ocfs2_flush_truncate_log(osb);
if (status < 0)
mlog_errno(status);
else
ocfs2_init_steal_slots(osb);
-
- mlog_exit(status);
}
#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
int cancel)
{
- if (osb->osb_tl_inode) {
+ if (osb->osb_tl_inode &&
+ atomic_read(&osb->osb_tl_disable) == 0) {
/* We want to push off log flushes while truncates are
* still running. */
if (cancel)
@@ -6086,7 +6086,6 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
*tl_inode = inode;
*tl_bh = bh;
bail:
- mlog_exit(status);
return status;
}
@@ -6106,7 +6105,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
*tl_copy = NULL;
- mlog(0, "recover truncate log from slot %d\n", slot_num);
+ trace_ocfs2_begin_truncate_log_recovery(slot_num);
status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
if (status < 0) {
@@ -6123,8 +6122,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
tl = &di->id2.i_dealloc;
if (le16_to_cpu(tl->tl_used)) {
- mlog(0, "We'll have %u logs to recover\n",
- le16_to_cpu(tl->tl_used));
+ trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
if (!(*tl_copy)) {
@@ -6157,9 +6155,9 @@ bail:
if (status < 0 && (*tl_copy)) {
kfree(*tl_copy);
*tl_copy = NULL;
+ mlog_errno(status);
}
- mlog_exit(status);
return status;
}
@@ -6174,8 +6172,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
struct inode *tl_inode = osb->osb_tl_inode;
struct ocfs2_truncate_log *tl;
- mlog_entry_void();
-
if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
return -EINVAL;
@@ -6183,8 +6179,9 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
tl = &tl_copy->id2.i_dealloc;
num_recs = le16_to_cpu(tl->tl_used);
- mlog(0, "cleanup %u records from %llu\n", num_recs,
- (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
+ trace_ocfs2_complete_truncate_log_recovery(
+ (unsigned long long)le64_to_cpu(tl_copy->i_blkno),
+ num_recs);
mutex_lock(&tl_inode->i_mutex);
for(i = 0; i < num_recs; i++) {
@@ -6219,7 +6216,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
bail_up:
mutex_unlock(&tl_inode->i_mutex);
- mlog_exit(status);
return status;
}
@@ -6228,7 +6224,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
int status;
struct inode *tl_inode = osb->osb_tl_inode;
- mlog_entry_void();
+ atomic_set(&osb->osb_tl_disable, 1);
if (tl_inode) {
cancel_delayed_work(&osb->osb_truncate_log_wq);
@@ -6241,8 +6237,6 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
brelse(osb->osb_tl_bh);
iput(osb->osb_tl_inode);
}
-
- mlog_exit_void();
}
int ocfs2_truncate_log_init(struct ocfs2_super *osb)
@@ -6251,8 +6245,6 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
struct inode *tl_inode = NULL;
struct buffer_head *tl_bh = NULL;
- mlog_entry_void();
-
status = ocfs2_get_truncate_log_info(osb,
osb->slot_num,
&tl_inode,
@@ -6265,10 +6257,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
* until we're sure all is well. */
INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
ocfs2_truncate_log_worker);
+ atomic_set(&osb->osb_tl_disable, 0);
osb->osb_tl_bh = tl_bh;
osb->osb_tl_inode = tl_inode;
- mlog_exit(status);
return status;
}
@@ -6298,6 +6290,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
*/
struct ocfs2_cached_block_free {
struct ocfs2_cached_block_free *free_next;
+ u64 free_bg;
u64 free_blk;
unsigned int free_bit;
};
@@ -6344,10 +6337,13 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
}
while (head) {
- bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
- head->free_bit);
- mlog(0, "Free bit: (bit %u, blkno %llu)\n",
- head->free_bit, (unsigned long long)head->free_blk);
+ if (head->free_bg)
+ bg_blkno = head->free_bg;
+ else
+ bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
+ head->free_bit);
+ trace_ocfs2_free_cached_blocks(
+ (unsigned long long)head->free_blk, head->free_bit);
ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
head->free_bit, bg_blkno, 1);
@@ -6393,15 +6389,14 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
int ret = 0;
struct ocfs2_cached_block_free *item;
- item = kmalloc(sizeof(*item), GFP_NOFS);
+ item = kzalloc(sizeof(*item), GFP_NOFS);
if (item == NULL) {
ret = -ENOMEM;
mlog_errno(ret);
return ret;
}
- mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
- bit, (unsigned long long)blkno);
+ trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit);
item->free_blk = blkno;
item->free_bit = bit;
@@ -6476,8 +6471,8 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
fl = ctxt->c_first_suballocator;
if (fl->f_first) {
- mlog(0, "Free items: (type %u, slot %d)\n",
- fl->f_inode_type, fl->f_slot);
+ trace_ocfs2_run_deallocs(fl->f_inode_type,
+ fl->f_slot);
ret2 = ocfs2_free_cached_blocks(osb,
fl->f_inode_type,
fl->f_slot,
@@ -6533,8 +6528,8 @@ ocfs2_find_per_slot_free_list(int type,
}
int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
- int type, int slot, u64 blkno,
- unsigned int bit)
+ int type, int slot, u64 suballoc,
+ u64 blkno, unsigned int bit)
{
int ret;
struct ocfs2_per_slot_free_list *fl;
@@ -6547,16 +6542,18 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
goto out;
}
- item = kmalloc(sizeof(*item), GFP_NOFS);
+ item = kzalloc(sizeof(*item), GFP_NOFS);
if (item == NULL) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
- mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
- type, slot, bit, (unsigned long long)blkno);
+ trace_ocfs2_cache_block_dealloc(type, slot,
+ (unsigned long long)suballoc,
+ (unsigned long long)blkno, bit);
+ item->free_bg = suballoc;
item->free_blk = blkno;
item->free_bit = bit;
item->free_next = fl->f_first;
@@ -6573,433 +6570,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
{
return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
le16_to_cpu(eb->h_suballoc_slot),
+ le64_to_cpu(eb->h_suballoc_loc),
le64_to_cpu(eb->h_blkno),
le16_to_cpu(eb->h_suballoc_bit));
}
-/* This function will figure out whether the currently last extent
- * block will be deleted, and if it will, what the new last extent
- * block will be so we can update his h_next_leaf_blk field, as well
- * as the dinodes i_last_eb_blk */
-static int ocfs2_find_new_last_ext_blk(struct inode *inode,
- unsigned int clusters_to_del,
- struct ocfs2_path *path,
- struct buffer_head **new_last_eb)
-{
- int next_free, ret = 0;
- u32 cpos;
- struct ocfs2_extent_rec *rec;
- struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *el;
- struct buffer_head *bh = NULL;
-
- *new_last_eb = NULL;
-
- /* we have no tree, so of course, no last_eb. */
- if (!path->p_tree_depth)
- goto out;
-
- /* trunc to zero special case - this makes tree_depth = 0
- * regardless of what it is. */
- if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
- goto out;
-
- el = path_leaf_el(path);
- BUG_ON(!el->l_next_free_rec);
-
- /*
- * Make sure that this extent list will actually be empty
- * after we clear away the data. We can shortcut out if
- * there's more than one non-empty extent in the
- * list. Otherwise, a check of the remaining extent is
- * necessary.
- */
- next_free = le16_to_cpu(el->l_next_free_rec);
- rec = NULL;
- if (ocfs2_is_empty_extent(&el->l_recs[0])) {
- if (next_free > 2)
- goto out;
-
- /* We may have a valid extent in index 1, check it. */
- if (next_free == 2)
- rec = &el->l_recs[1];
-
- /*
- * Fall through - no more nonempty extents, so we want
- * to delete this leaf.
- */
- } else {
- if (next_free > 1)
- goto out;
-
- rec = &el->l_recs[0];
- }
-
- if (rec) {
- /*
- * Check it we'll only be trimming off the end of this
- * cluster.
- */
- if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
- goto out;
- }
-
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- eb = (struct ocfs2_extent_block *) bh->b_data;
- el = &eb->h_list;
-
- /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
- * Any corruption is a code bug. */
- BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
-
- *new_last_eb = bh;
- get_bh(*new_last_eb);
- mlog(0, "returning block %llu, (cpos: %u)\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
-out:
- brelse(bh);
-
- return ret;
-}
-
-/*
- * Trim some clusters off the rightmost edge of a tree. Only called
- * during truncate.
- *
- * The caller needs to:
- * - start journaling of each path component.
- * - compute and fully set up any new last ext block
- */
-static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
- handle_t *handle, struct ocfs2_truncate_context *tc,
- u32 clusters_to_del, u64 *delete_start, u8 *flags)
-{
- int ret, i, index = path->p_tree_depth;
- u32 new_edge = 0;
- u64 deleted_eb = 0;
- struct buffer_head *bh;
- struct ocfs2_extent_list *el;
- struct ocfs2_extent_rec *rec;
-
- *delete_start = 0;
- *flags = 0;
-
- while (index >= 0) {
- bh = path->p_node[index].bh;
- el = path->p_node[index].el;
-
- mlog(0, "traveling tree (index = %d, block = %llu)\n",
- index, (unsigned long long)bh->b_blocknr);
-
- BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
-
- if (index !=
- (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has invalid ext. block %llu",
- inode->i_ino,
- (unsigned long long)bh->b_blocknr);
- ret = -EROFS;
- goto out;
- }
-
-find_tail_record:
- i = le16_to_cpu(el->l_next_free_rec) - 1;
- rec = &el->l_recs[i];
-
- mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
- "next = %u\n", i, le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec),
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- le16_to_cpu(el->l_next_free_rec));
-
- BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
-
- if (le16_to_cpu(el->l_tree_depth) == 0) {
- /*
- * If the leaf block contains a single empty
- * extent and no records, we can just remove
- * the block.
- */
- if (i == 0 && ocfs2_is_empty_extent(rec)) {
- memset(rec, 0,
- sizeof(struct ocfs2_extent_rec));
- el->l_next_free_rec = cpu_to_le16(0);
-
- goto delete;
- }
-
- /*
- * Remove any empty extents by shifting things
- * left. That should make life much easier on
- * the code below. This condition is rare
- * enough that we shouldn't see a performance
- * hit.
- */
- if (ocfs2_is_empty_extent(&el->l_recs[0])) {
- le16_add_cpu(&el->l_next_free_rec, -1);
-
- for(i = 0;
- i < le16_to_cpu(el->l_next_free_rec); i++)
- el->l_recs[i] = el->l_recs[i + 1];
-
- memset(&el->l_recs[i], 0,
- sizeof(struct ocfs2_extent_rec));
-
- /*
- * We've modified our extent list. The
- * simplest way to handle this change
- * is to being the search from the
- * start again.
- */
- goto find_tail_record;
- }
-
- le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
-
- /*
- * We'll use "new_edge" on our way back up the
- * tree to know what our rightmost cpos is.
- */
- new_edge = le16_to_cpu(rec->e_leaf_clusters);
- new_edge += le32_to_cpu(rec->e_cpos);
-
- /*
- * The caller will use this to delete data blocks.
- */
- *delete_start = le64_to_cpu(rec->e_blkno)
- + ocfs2_clusters_to_blocks(inode->i_sb,
- le16_to_cpu(rec->e_leaf_clusters));
- *flags = rec->e_flags;
-
- /*
- * If it's now empty, remove this record.
- */
- if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
- memset(rec, 0,
- sizeof(struct ocfs2_extent_rec));
- le16_add_cpu(&el->l_next_free_rec, -1);
- }
- } else {
- if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
- memset(rec, 0,
- sizeof(struct ocfs2_extent_rec));
- le16_add_cpu(&el->l_next_free_rec, -1);
-
- goto delete;
- }
-
- /* Can this actually happen? */
- if (le16_to_cpu(el->l_next_free_rec) == 0)
- goto delete;
-
- /*
- * We never actually deleted any clusters
- * because our leaf was empty. There's no
- * reason to adjust the rightmost edge then.
- */
- if (new_edge == 0)
- goto delete;
-
- rec->e_int_clusters = cpu_to_le32(new_edge);
- le32_add_cpu(&rec->e_int_clusters,
- -le32_to_cpu(rec->e_cpos));
-
- /*
- * A deleted child record should have been
- * caught above.
- */
- BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
- }
-
-delete:
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- mlog(0, "extent list container %llu, after: record %d: "
- "(%u, %u, %llu), next = %u.\n",
- (unsigned long long)bh->b_blocknr, i,
- le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
- (unsigned long long)le64_to_cpu(rec->e_blkno),
- le16_to_cpu(el->l_next_free_rec));
-
- /*
- * We must be careful to only attempt delete of an
- * extent block (and not the root inode block).
- */
- if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
- struct ocfs2_extent_block *eb =
- (struct ocfs2_extent_block *)bh->b_data;
-
- /*
- * Save this for use when processing the
- * parent block.
- */
- deleted_eb = le64_to_cpu(eb->h_blkno);
-
- mlog(0, "deleting this extent block.\n");
-
- ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
-
- BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
- BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
- BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
-
- ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
- /* An error here is not fatal. */
- if (ret < 0)
- mlog_errno(ret);
- } else {
- deleted_eb = 0;
- }
-
- index--;
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-static int ocfs2_do_truncate(struct ocfs2_super *osb,
- unsigned int clusters_to_del,
- struct inode *inode,
- struct buffer_head *fe_bh,
- handle_t *handle,
- struct ocfs2_truncate_context *tc,
- struct ocfs2_path *path,
- struct ocfs2_alloc_context *meta_ac)
-{
- int status;
- struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *last_eb = NULL;
- struct ocfs2_extent_list *el;
- struct buffer_head *last_eb_bh = NULL;
- u64 delete_blk = 0;
- u8 rec_flags;
-
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
- status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
- path, &last_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- /*
- * Each component will be touched, so we might as well journal
- * here to avoid having to handle errors later.
- */
- status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- if (last_eb_bh) {
- status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- }
-
- el = &(fe->id2.i_list);
-
- /*
- * Lower levels depend on this never happening, but it's best
- * to check it up here before changing the tree.
- */
- if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has an empty extent record, depth %u\n",
- inode->i_ino, le16_to_cpu(el->l_tree_depth));
- status = -EROFS;
- goto bail;
- }
-
- dquot_free_space_nodirty(inode,
- ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
- clusters_to_del;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
- le32_add_cpu(&fe->i_clusters, -clusters_to_del);
- inode->i_blocks = ocfs2_inode_sector_count(inode);
-
- status = ocfs2_trim_tree(inode, path, handle, tc,
- clusters_to_del, &delete_blk, &rec_flags);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
-
- if (le32_to_cpu(fe->i_clusters) == 0) {
- /* trunc to zero is a special case. */
- el->l_tree_depth = 0;
- fe->i_last_eb_blk = 0;
- } else if (last_eb)
- fe->i_last_eb_blk = last_eb->h_blkno;
-
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- if (last_eb) {
- /* If there will be a new last extent block, then by
- * definition, there cannot be any leaves to the right of
- * him. */
- last_eb->h_next_leaf_blk = 0;
- status = ocfs2_journal_dirty(handle, last_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
-
- if (delete_blk) {
- if (rec_flags & OCFS2_EXT_REFCOUNTED)
- status = ocfs2_decrease_refcount(inode, handle,
- ocfs2_blocks_to_clusters(osb->sb,
- delete_blk),
- clusters_to_del, meta_ac,
- &tc->tc_dealloc, 1);
- else
- status = ocfs2_truncate_log_append(osb, handle,
- delete_blk,
- clusters_to_del);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
- status = 0;
-bail:
- brelse(last_eb_bh);
- mlog_exit(status);
- return status;
-}
-
static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
{
set_buffer_uptodate(bh);
@@ -7091,7 +6666,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
last_page_bytes = PAGE_ALIGN(end);
index = start >> PAGE_CACHE_SHIFT;
do {
- pages[numpages] = grab_cache_page(mapping, index);
+ pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
if (!pages[numpages]) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -7197,8 +6772,7 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
mlog_errno(ret);
out:
- if (pages)
- kfree(pages);
+ kfree(pages);
return ret;
}
@@ -7252,6 +6826,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct buffer_head *di_bh)
{
int ret, i, has_data, num_pages = 0;
+ int need_free = 0;
+ u32 bit_off, num;
handle_t *handle;
u64 uninitialized_var(block);
struct ocfs2_inode_info *oi = OCFS2_I(inode);
@@ -7297,7 +6873,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
if (has_data) {
- u32 bit_off, num;
unsigned int page_end;
u64 phys;
@@ -7307,7 +6882,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
goto out_commit;
did_quota = 1;
- ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+ data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
+ ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
&num);
if (ret) {
mlog_errno(ret);
@@ -7331,6 +6908,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
if (ret) {
mlog_errno(ret);
+ need_free = 1;
goto out_commit;
}
@@ -7341,6 +6919,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
if (ret) {
mlog_errno(ret);
+ need_free = 1;
goto out_commit;
}
@@ -7358,6 +6937,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
spin_unlock(&oi->ip_lock);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_dinode_new_extent_list(inode, di);
ocfs2_journal_dirty(handle, di_bh);
@@ -7372,6 +6952,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
if (ret) {
mlog_errno(ret);
+ need_free = 1;
goto out_commit;
}
@@ -7383,6 +6964,18 @@ out_commit:
dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (need_free) {
+ if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ bit_off, num);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, bit_off),
+ num);
+ }
+
ocfs2_commit_trans(osb, handle);
out_unlock:
@@ -7406,26 +6999,27 @@ out:
*/
int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context *tc)
+ struct buffer_head *di_bh)
{
- int status, i, credits, tl_sem = 0;
- u32 clusters_to_del, new_highest_cpos, range;
+ int status = 0, i, flags = 0;
+ u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
u64 blkno = 0;
struct ocfs2_extent_list *el;
- handle_t *handle = NULL;
- struct inode *tl_inode = osb->osb_tl_inode;
+ struct ocfs2_extent_rec *rec;
struct ocfs2_path *path = NULL;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
- struct ocfs2_alloc_context *meta_ac = NULL;
- struct ocfs2_refcount_tree *ref_tree = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_extent_list *root_el = &(di->id2.i_list);
+ u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
+ struct ocfs2_extent_tree et;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
- mlog_entry_void();
+ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+ ocfs2_init_dealloc_ctxt(&dealloc);
new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
i_size_read(inode));
- path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+ path = ocfs2_new_path(di_bh, &di->id2.i_list,
ocfs2_journal_access_di);
if (!path) {
status = -ENOMEM;
@@ -7444,8 +7038,6 @@ start:
goto bail;
}
- credits = 0;
-
/*
* Truncate always works against the rightmost tree branch.
*/
@@ -7455,8 +7047,11 @@ start:
goto bail;
}
- mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
- OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
+ trace_ocfs2_commit_truncate(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ new_highest_cpos,
+ OCFS2_I(inode)->ip_clusters,
+ path->p_tree_depth);
/*
* By now, el will point to the extent list on the bottom most
@@ -7480,101 +7075,62 @@ start:
}
i = le16_to_cpu(el->l_next_free_rec) - 1;
- range = le32_to_cpu(el->l_recs[i].e_cpos) +
- ocfs2_rec_clusters(el, &el->l_recs[i]);
- if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
- clusters_to_del = 0;
- } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
- clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
- blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+ rec = &el->l_recs[i];
+ flags = rec->e_flags;
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+ if (i == 0 && ocfs2_is_empty_extent(rec)) {
+ /*
+ * Lower levels depend on this never happening, but it's best
+ * to check it up here before changing the tree.
+ */
+ if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
+ ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+ "extent record, depth %u\n", inode->i_ino,
+ le16_to_cpu(root_el->l_tree_depth));
+ status = -EROFS;
+ goto bail;
+ }
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = 0;
+ blkno = 0;
+ } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
+ /*
+ * Truncate entire record.
+ */
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = ocfs2_rec_clusters(el, rec);
+ blkno = le64_to_cpu(rec->e_blkno);
} else if (range > new_highest_cpos) {
- clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
- le32_to_cpu(el->l_recs[i].e_cpos)) -
- new_highest_cpos;
- blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
- ocfs2_clusters_to_blocks(inode->i_sb,
- ocfs2_rec_clusters(el, &el->l_recs[i]) -
- clusters_to_del);
+ /*
+ * Partial truncate. it also should be
+ * the last truncate we're doing.
+ */
+ trunc_cpos = new_highest_cpos;
+ trunc_len = range - new_highest_cpos;
+ coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
+ blkno = le64_to_cpu(rec->e_blkno) +
+ ocfs2_clusters_to_blocks(inode->i_sb, coff);
} else {
+ /*
+ * Truncate completed, leave happily.
+ */
status = 0;
goto bail;
}
- mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
- clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
-
- if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
- BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
- OCFS2_HAS_REFCOUNT_FL));
+ phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
- status = ocfs2_lock_refcount_tree(osb,
- le64_to_cpu(di->i_refcount_loc),
- 1, &ref_tree, NULL);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
-
- status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
- blkno,
- clusters_to_del,
- &credits,
- &meta_ac);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
-
- mutex_lock(&tl_inode->i_mutex);
- tl_sem = 1;
- /* ocfs2_truncate_log_needs_flush guarantees us at least one
- * record is free for use. If there isn't any, we flush to get
- * an empty truncate log. */
- if (ocfs2_truncate_log_needs_flush(osb)) {
- status = __ocfs2_flush_truncate_log(osb);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
-
- credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
- (struct ocfs2_dinode *)fe_bh->b_data,
- el);
- handle = ocfs2_start_trans(osb, credits);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- handle = NULL;
- mlog_errno(status);
- goto bail;
- }
-
- status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
- tc, path, meta_ac);
+ status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+ phys_cpos, trunc_len, flags, &dealloc,
+ refcount_loc);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- mutex_unlock(&tl_inode->i_mutex);
- tl_sem = 0;
-
- ocfs2_commit_trans(osb, handle);
- handle = NULL;
-
ocfs2_reinit_path(path, 1);
- if (meta_ac) {
- ocfs2_free_alloc_context(meta_ac);
- meta_ac = NULL;
- }
-
- if (ref_tree) {
- ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
- ref_tree = NULL;
- }
-
/*
* The check above will catch the case where we've truncated
* away all allocation.
@@ -7585,84 +7141,10 @@ bail:
ocfs2_schedule_truncate_log_flush(osb, 1);
- if (tl_sem)
- mutex_unlock(&tl_inode->i_mutex);
-
- if (handle)
- ocfs2_commit_trans(osb, handle);
-
- if (meta_ac)
- ocfs2_free_alloc_context(meta_ac);
-
- if (ref_tree)
- ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
-
- ocfs2_run_deallocs(osb, &tc->tc_dealloc);
+ ocfs2_run_deallocs(osb, &dealloc);
ocfs2_free_path(path);
- /* This will drop the ext_alloc cluster lock for us */
- ocfs2_free_truncate_context(tc);
-
- mlog_exit(status);
- return status;
-}
-
-/*
- * Expects the inode to already be locked.
- */
-int ocfs2_prepare_truncate(struct ocfs2_super *osb,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context **tc)
-{
- int status;
- unsigned int new_i_clusters;
- struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *eb;
- struct buffer_head *last_eb_bh = NULL;
-
- mlog_entry_void();
-
- *tc = NULL;
-
- new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
- i_size_read(inode));
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
- mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
- "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
- (unsigned long long)le64_to_cpu(fe->i_size));
-
- *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
- if (!(*tc)) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
- ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
-
- if (fe->id2.i_list.l_tree_depth) {
- status = ocfs2_read_extent_block(INODE_CACHE(inode),
- le64_to_cpu(fe->i_last_eb_blk),
- &last_eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- }
-
- (*tc)->tc_last_eb_bh = last_eb_bh;
-
- status = 0;
-bail:
- if (status < 0) {
- if (*tc)
- ocfs2_free_truncate_context(*tc);
- *tc = NULL;
- }
- mlog_exit_void();
return status;
}
@@ -7682,7 +7164,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
if (end > i_size_read(inode))
end = i_size_read(inode);
- BUG_ON(start >= end);
+ BUG_ON(start > end);
if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
@@ -7732,6 +7214,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, di_bh);
out_commit:
@@ -7741,17 +7224,163 @@ out:
return ret;
}
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
+static int ocfs2_trim_extent(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ u32 start, u32 count)
{
- /*
- * The caller is responsible for completing deallocation
- * before freeing the context.
- */
- if (tc->tc_dealloc.c_first_suballocator != NULL)
- mlog(ML_NOTICE,
- "Truncate completion has non-empty dealloc context\n");
+ u64 discard, bcount;
+
+ bcount = ocfs2_clusters_to_blocks(sb, count);
+ discard = le64_to_cpu(gd->bg_blkno) +
+ ocfs2_clusters_to_blocks(sb, start);
+
+ trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
+
+ return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
+}
+
+static int ocfs2_trim_group(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ u32 start, u32 max, u32 minbits)
+{
+ int ret = 0, count = 0, next;
+ void *bitmap = gd->bg_bitmap;
+
+ if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
+ return 0;
+
+ trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
+ start, max, minbits);
+
+ while (start < max) {
+ start = ocfs2_find_next_zero_bit(bitmap, max, start);
+ if (start >= max)
+ break;
+ next = ocfs2_find_next_bit(bitmap, max, start);
+
+ if ((next - start) >= minbits) {
+ ret = ocfs2_trim_extent(sb, gd,
+ start, next - start);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ count += next - start;
+ }
+ start = next + 1;
+
+ if (fatal_signal_pending(current)) {
+ count = -ERESTARTSYS;
+ break;
+ }
+
+ if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
+ break;
+ }
+
+ if (ret < 0)
+ count = ret;
+
+ return count;
+}
- brelse(tc->tc_last_eb_bh);
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ u64 start, len, trimmed, first_group, last_group, group;
+ int ret, cnt;
+ u32 first_bit, last_bit, minlen;
+ struct buffer_head *main_bm_bh = NULL;
+ struct inode *main_bm_inode = NULL;
+ struct buffer_head *gd_bh = NULL;
+ struct ocfs2_dinode *main_bm;
+ struct ocfs2_group_desc *gd = NULL;
+
+ start = range->start >> osb->s_clustersize_bits;
+ len = range->len >> osb->s_clustersize_bits;
+ minlen = range->minlen >> osb->s_clustersize_bits;
+
+ if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
+ return -EINVAL;
- kfree(tc);
+ main_bm_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!main_bm_inode) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock(&main_bm_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+ main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ len = range->len >> osb->s_clustersize_bits;
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
+
+ trace_ocfs2_trim_fs(start, len, minlen);
+
+ /* Determine first and last group to examine based on start and len */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
+ last_bit = osb->bitmap_cpg;
+
+ trimmed = 0;
+ for (group = first_group; group <= last_group;) {
+ if (first_bit + len >= osb->bitmap_cpg)
+ last_bit = osb->bitmap_cpg;
+ else
+ last_bit = first_bit + len;
+
+ ret = ocfs2_read_group_descriptor(main_bm_inode,
+ main_bm, group,
+ &gd_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
+ gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+ cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+ brelse(gd_bh);
+ gd_bh = NULL;
+ if (cnt < 0) {
+ ret = cnt;
+ mlog_errno(ret);
+ break;
+ }
+
+ trimmed += cnt;
+ len -= osb->bitmap_cpg - first_bit;
+ first_bit = 0;
+ if (group == osb->first_cluster_group_blkno)
+ group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ else
+ group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ }
+ range->len = trimmed * sb->s_blocksize;
+out_unlock:
+ ocfs2_inode_unlock(main_bm_inode, 0);
+ brelse(main_bm_bh);
+out_mutex:
+ mutex_unlock(&main_bm_inode->i_mutex);
+ iput(main_bm_inode);
+out:
+ return ret;
}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 1db4359ccb9..ca381c58412 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_extent_tree *et,
- u32 cpos, u32 phys_cpos, u32 len,
- struct ocfs2_cached_dealloc_ctxt *dealloc);
+ u32 cpos, u32 phys_cpos, u32 len, int flags,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ u64 refcount_loc);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct ocfs2_extent_tree *et);
@@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
u64 blkno, unsigned int bit);
int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
- int type, int slot, u64 blkno,
+ int type, int slot, u64 suballoc, u64 blkno,
unsigned int bit);
static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
{
@@ -227,14 +228,9 @@ struct ocfs2_truncate_context {
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
u64 range_start, u64 range_end);
-int ocfs2_prepare_truncate(struct ocfs2_super *osb,
- struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context **tc);
int ocfs2_commit_truncate(struct ocfs2_super *osb,
struct inode *inode,
- struct buffer_head *fe_bh,
- struct ocfs2_truncate_context *tc);
+ struct buffer_head *di_bh);
int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
unsigned int start, unsigned int end, int trunc);
@@ -243,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
struct buffer_head **leaf_bh);
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
/*
* Helper function to look at the # of clusters in an extent record.
*/
@@ -319,6 +316,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
struct ocfs2_path *path);
int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos);
int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
struct ocfs2_path *left,
struct ocfs2_path *right);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 21441ddb550..4a231a166cf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -29,7 +29,6 @@
#include <linux/mpage.h>
#include <linux/quotaops.h>
-#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -45,6 +44,7 @@
#include "super.h"
#include "symlink.h"
#include "refcounttree.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -59,8 +59,9 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
void *kaddr;
- mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
- (unsigned long long)iblock, bh_result, create);
+ trace_ocfs2_symlink_get_block(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)iblock, bh_result, create);
BUG_ON(ocfs2_inode_is_fast_symlink(inode));
@@ -79,6 +80,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(fe->i_clusters))) {
+ err = -ENOMEM;
mlog(ML_ERROR, "block offset is outside the allocated size: "
"%llu\n", (unsigned long long)iblock);
goto bail;
@@ -91,6 +93,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
iblock;
buffer_cache_bh = sb_getblk(osb->sb, blkno);
if (!buffer_cache_bh) {
+ err = -ENOMEM;
mlog(ML_ERROR, "couldn't getblock for symlink!\n");
goto bail;
}
@@ -101,7 +104,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
* copy, the data is still good. */
if (buffer_jbd(buffer_cache_bh)
&& ocfs2_inode_is_new(inode)) {
- kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh_result->b_page);
if (!kaddr) {
mlog(ML_ERROR, "couldn't kmap!\n");
goto bail;
@@ -109,7 +112,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
memcpy(kaddr + (bh_result->b_size * iblock),
buffer_cache_bh->b_data,
bh_result->b_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
set_buffer_uptodate(bh_result);
}
brelse(buffer_cache_bh);
@@ -123,7 +126,6 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
bail:
brelse(bh);
- mlog_exit(err);
return err;
}
@@ -136,8 +138,8 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
u64 p_blkno, count, past_eof;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
- (unsigned long long)iblock, bh_result, create);
+ trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)iblock, bh_result, create);
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
@@ -165,7 +167,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
* ocfs2 never allocates in this function - the only time we
* need to use BH_New is when we're extending i_size on a file
* system which doesn't support holes, in which case BH_New
- * allows block_prepare_write() to zero.
+ * allows __block_write_begin() to zero.
*
* If we see this on a sparse file system, then a truncate has
* raced us and removed the cluster. In this case, we clear
@@ -196,20 +198,19 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
dump_stack();
goto bail;
}
+ }
- past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
- (unsigned long long)past_eof);
+ past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- if (create && (iblock >= past_eof))
- set_buffer_new(bh_result);
- }
+ trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)past_eof);
+ if (create && (iblock >= past_eof))
+ set_buffer_new(bh_result);
bail:
if (err < 0)
err = -EIO;
- mlog_exit(err);
return err;
}
@@ -237,13 +238,13 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
return -EROFS;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (size)
memcpy(kaddr, di->id2.i_data.id_data, size);
/* Clear the remaining part of the page */
memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
SetPageUptodate(page);
@@ -279,7 +280,8 @@ static int ocfs2_readpage(struct file *file, struct page *page)
loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
int ret, unlock = 1;
- mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
+ trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
+ (page ? page->index : 0));
ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
if (ret != 0) {
@@ -290,7 +292,15 @@ static int ocfs2_readpage(struct file *file, struct page *page)
}
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+ /*
+ * Unlock the page and cycle ip_alloc_sem so that we don't
+ * busyloop waiting for ip_alloc_sem to unlock
+ */
ret = AOP_TRUNCATED_PAGE;
+ unlock_page(page);
+ unlock = 0;
+ down_read(&oi->ip_alloc_sem);
+ up_read(&oi->ip_alloc_sem);
goto out_inode_unlock;
}
@@ -324,7 +334,6 @@ out_inode_unlock:
out:
if (unlock)
unlock_page(page);
- mlog_exit(ret);
return ret;
}
@@ -397,30 +406,11 @@ out_unlock:
*/
static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
{
- int ret;
-
- mlog_entry("(0x%p)\n", page);
-
- ret = block_write_full_page(page, ocfs2_get_block, wbc);
-
- mlog_exit(ret);
-
- return ret;
-}
-
-/*
- * This is called from ocfs2_write_zero_page() which has handled it's
- * own cluster locking and has ensured allocation exists for those
- * blocks to be written.
- */
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
- unsigned from, unsigned to)
-{
- int ret;
-
- ret = block_prepare_write(page, from, to, ocfs2_get_block);
+ trace_ocfs2_writepage(
+ (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
+ page->index);
- return ret;
+ return block_write_full_page(page, ocfs2_get_block, wbc);
}
/* Taken from ext3. We don't necessarily need the full blown
@@ -459,36 +449,6 @@ int walk_page_buffers( handle_t *handle,
return ret;
}
-handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
- struct page *page,
- unsigned from,
- unsigned to)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- handle_t *handle;
- int ret = 0;
-
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
- }
-
- if (ocfs2_should_order_data(inode)) {
- ret = ocfs2_jbd2_file_inode(handle, inode);
- if (ret < 0)
- mlog_errno(ret);
- }
-out:
- if (ret) {
- if (!IS_ERR(handle))
- ocfs2_commit_trans(osb, handle);
- handle = ERR_PTR(ret);
- }
- return handle;
-}
-
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
sector_t status;
@@ -496,7 +456,8 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
int err = 0;
struct inode *inode = mapping->host;
- mlog_entry("(block = %llu)\n", (unsigned long long)block);
+ trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)block);
/* We don't need to lock journal system files, since they aren't
* accessed concurrently from multiple nodes.
@@ -530,8 +491,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
bail:
status = err ? 0 : p_blkno;
- mlog_exit((int)status);
-
return status;
}
@@ -602,61 +561,49 @@ bail:
/*
* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
- * particularly interested in the aio/dio case. Like the core uses
- * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
- * truncation on another.
+ * particularly interested in the aio/dio case. We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
*/
static void ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset,
ssize_t bytes,
void *private)
{
- struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(iocb->ki_filp);
int level;
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+ if (ocfs2_iocb_is_sem_locked(iocb))
+ ocfs2_iocb_clear_sem_locked(iocb);
+
+ if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
+
+ mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
+ }
+
ocfs2_iocb_clear_rw_locked(iocb);
level = ocfs2_iocb_rw_locked_level(iocb);
- if (!level)
- up_read(&inode->i_alloc_sem);
ocfs2_rw_unlock(inode, level);
}
-/*
- * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
- * from ext3. PageChecked() bits have been removed as OCFS2 does not
- * do journalled data.
- */
-static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
-{
- journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
-
- jbd2_journal_invalidatepage(journal, page, offset);
-}
-
static int ocfs2_releasepage(struct page *page, gfp_t wait)
{
- journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
-
if (!page_has_buffers(page))
return 0;
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ return try_to_free_buffers(page);
}
static ssize_t ocfs2_direct_IO(int rw,
struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
- int ret;
-
- mlog_entry_void();
+ struct inode *inode = file_inode(file)->i_mapping->host;
/*
* Fallback to buffered I/O if we see an inode without
@@ -669,14 +616,10 @@ static ssize_t ocfs2_direct_IO(int rw,
if (i_size_read(inode) <= offset)
return 0;
- ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
- inode->i_sb->s_bdev, iov, offset,
- nr_segs,
- ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io);
-
- mlog_exit(ret);
- return ret;
+ return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+ iter, offset,
+ ocfs2_direct_IO_get_blocks,
+ ocfs2_dio_end_io, NULL, 0);
}
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
@@ -723,7 +666,7 @@ static void ocfs2_clear_page_regions(struct page *page,
ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (from || to) {
if (from > cluster_start)
@@ -734,7 +677,7 @@ static void ocfs2_clear_page_regions(struct page *page,
memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
/*
@@ -759,7 +702,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
}
/*
- * Some of this taken from block_prepare_write(). We already have our
+ * Some of this taken from __block_write_begin(). We already have our
* mapping by now though, and the entire write will be allocating or
* it won't, so not much need to use BH_New.
*
@@ -910,11 +853,17 @@ struct ocfs2_write_ctxt {
* out in so that future reads from that region will get
* zero's.
*/
- struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
unsigned int w_num_pages;
+ struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
struct page *w_target_page;
/*
+ * w_target_locked is used for page_mkwrite path indicating no unlocking
+ * against w_target_page in ocfs2_write_end_nolock.
+ */
+ unsigned int w_target_locked:1;
+
+ /*
* ocfs2_write_end() uses this to know what the real range to
* write in the target should be.
*/
@@ -947,6 +896,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
{
+ int i;
+
+ /*
+ * w_target_locked is only set to true in the page_mkwrite() case.
+ * The intent is to allow us to lock the target page from write_begin()
+ * to write_end(). The caller must hold a ref on w_target_page.
+ */
+ if (wc->w_target_locked) {
+ BUG_ON(!wc->w_target_page);
+ for (i = 0; i < wc->w_num_pages; i++) {
+ if (wc->w_target_page == wc->w_pages[i]) {
+ wc->w_pages[i] = NULL;
+ break;
+ }
+ }
+ mark_page_accessed(wc->w_target_page);
+ page_cache_release(wc->w_target_page);
+ }
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
brelse(wc->w_di_bh);
@@ -1065,6 +1032,12 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
&cluster_start, &cluster_end);
+ /* treat the write as new if the a hole/lseek spanned across
+ * the page boundary.
+ */
+ new = new | ((i_size_read(inode) <= page_offset(page)) &&
+ (page_offset(page) <= user_pos));
+
if (page == wc->w_target_page) {
map_from = user_pos & (PAGE_CACHE_SIZE - 1);
map_to = map_from + user_len;
@@ -1131,23 +1104,37 @@ out:
*/
static int ocfs2_grab_pages_for_write(struct address_space *mapping,
struct ocfs2_write_ctxt *wc,
- u32 cpos, loff_t user_pos, int new,
+ u32 cpos, loff_t user_pos,
+ unsigned user_len, int new,
struct page *mmap_page)
{
int ret = 0, i;
- unsigned long start, target_index, index;
+ unsigned long start, target_index, end_index, index;
struct inode *inode = mapping->host;
+ loff_t last_byte;
target_index = user_pos >> PAGE_CACHE_SHIFT;
/*
* Figure out how many pages we'll be manipulating here. For
* non allocating write, we just change the one
- * page. Otherwise, we'll need a whole clusters worth.
+ * page. Otherwise, we'll need a whole clusters worth. If we're
+ * writing past i_size, we only need enough pages to cover the
+ * last page of the write.
*/
if (new) {
wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+ /*
+ * We need the index *past* the last page we could possibly
+ * touch. This is the page past the end of the write or
+ * i_size, whichever is greater.
+ */
+ last_byte = max(user_pos + user_len, i_size_read(inode));
+ BUG_ON(last_byte < 1);
+ end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
+ if ((start + wc->w_num_pages) > end_index)
+ wc->w_num_pages = end_index - start;
} else {
wc->w_num_pages = 1;
start = target_index;
@@ -1164,20 +1151,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
*/
lock_page(mmap_page);
+ /* Exit and let the caller retry */
if (mmap_page->mapping != mapping) {
+ WARN_ON(mmap_page->mapping);
unlock_page(mmap_page);
- /*
- * Sanity check - the locking in
- * ocfs2_pagemkwrite() should ensure
- * that this code doesn't trigger.
- */
- ret = -EINVAL;
- mlog_errno(ret);
+ ret = -EAGAIN;
goto out;
}
page_cache_get(mmap_page);
wc->w_pages[i] = mmap_page;
+ wc->w_target_locked = true;
} else {
wc->w_pages[i] = find_or_create_page(mapping, index,
GFP_NOFS);
@@ -1187,11 +1171,14 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
goto out;
}
}
+ wait_for_stable_page(wc->w_pages[i]);
if (index == target_index)
wc->w_target_page = wc->w_pages[i];
}
out:
+ if (ret)
+ wc->w_target_locked = false;
return ret;
}
@@ -1559,9 +1546,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_dinode *di = NULL;
- mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
- (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
- oi->ip_dyn_features);
+ trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno,
+ len, (unsigned long long)pos,
+ oi->ip_dyn_features);
/*
* Handle inodes which already have inline data 1st.
@@ -1620,21 +1607,20 @@ out:
* write path can treat it as an non-allocating write, which has no
* special case code for sparse/nonsparse files.
*/
-static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
- unsigned len,
+static int ocfs2_expand_nonsparse_inode(struct inode *inode,
+ struct buffer_head *di_bh,
+ loff_t pos, unsigned len,
struct ocfs2_write_ctxt *wc)
{
int ret;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
loff_t newsize = pos + len;
- if (ocfs2_sparse_alloc(osb))
- return 0;
+ BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
if (newsize <= i_size_read(inode))
return 0;
- ret = ocfs2_extend_no_holes(inode, newsize, pos);
+ ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
if (ret)
mlog_errno(ret);
@@ -1644,13 +1630,63 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
return ret;
}
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+ loff_t pos)
+{
+ int ret = 0;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+ if (pos > i_size_read(inode))
+ ret = ocfs2_zero_extend(inode, di_bh, pos);
+
+ return ret;
+}
+
+/*
+ * Try to flush truncate logs if we can free enough clusters from it.
+ * As for return value, "< 0" means error, "0" no space and "1" means
+ * we have freed enough spaces and let the caller try to allocate again.
+ */
+static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+ unsigned int needed)
+{
+ tid_t target;
+ int ret = 0;
+ unsigned int truncated_clusters;
+
+ mutex_lock(&osb->osb_tl_inode->i_mutex);
+ truncated_clusters = osb->truncated_clusters;
+ mutex_unlock(&osb->osb_tl_inode->i_mutex);
+
+ /*
+ * Check whether we can succeed in allocating if we free
+ * the truncate log.
+ */
+ if (truncated_clusters < needed)
+ goto out;
+
+ ret = ocfs2_flush_truncate_log(osb);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
+ jbd2_log_wait_commit(osb->journal->j_journal, target);
+ ret = 1;
+ }
+out:
+ return ret;
+}
+
+int ocfs2_write_begin_nolock(struct file *filp,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page)
{
int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
- unsigned int clusters_to_alloc, extents_to_split;
+ unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
struct ocfs2_write_ctxt *wc;
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1659,7 +1695,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle;
struct ocfs2_extent_tree et;
+ int try_free = 1, ret1;
+try_again:
ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
if (ret) {
mlog_errno(ret);
@@ -1679,7 +1717,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
}
}
- ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
+ wc);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1690,6 +1732,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
mlog_errno(ret);
goto out;
} else if (ret == 1) {
+ clusters_need = wc->w_clen;
ret = ocfs2_refcount_cow(inode, di_bh,
wc->w_cpos, wc->w_clen, UINT_MAX);
if (ret) {
@@ -1704,9 +1747,17 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
mlog_errno(ret);
goto out;
}
+ clusters_need += clusters_to_alloc;
di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+ trace_ocfs2_write_begin_nolock(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (long long)i_size_read(inode),
+ le32_to_cpu(di->i_clusters),
+ pos, len, flags, mmap_page,
+ clusters_to_alloc, extents_to_split);
+
/*
* We set w_target_from, w_target_to here so that
* ocfs2_write_end() knows which range in the target page to
@@ -1719,12 +1770,6 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* ocfs2_lock_allocators(). It greatly over-estimates
* the work to be done.
*/
- mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
- " clusters_to_add = %u, extents_to_split = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
- clusters_to_alloc, extents_to_split);
-
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
wc->w_di_bh);
ret = ocfs2_lock_allocators(inode, &et,
@@ -1735,9 +1780,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
goto out;
}
+ if (data_ac)
+ data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+
credits = ocfs2_calc_extend_credits(inode->i_sb,
- &di->id2.i_list,
- clusters_to_alloc);
+ &di->id2.i_list);
}
@@ -1786,13 +1833,25 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* that we can zero and flush if we error after adding the
* extent.
*/
- ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
- if (ret) {
+ if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out_quota;
}
+ /*
+ * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
+ * the target page. In this case, we exit with no error and no target
+ * page. This will trigger the caller, page_mkwrite(), to re-try
+ * the operation.
+ */
+ if (ret == -EAGAIN) {
+ BUG_ON(wc->w_target_page);
+ ret = 0;
+ goto out_quota;
+ }
+
ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
len);
if (ret) {
@@ -1819,10 +1878,30 @@ out_commit:
out:
ocfs2_free_write_ctxt(wc);
- if (data_ac)
+ if (data_ac) {
ocfs2_free_alloc_context(data_ac);
- if (meta_ac)
+ data_ac = NULL;
+ }
+ if (meta_ac) {
ocfs2_free_alloc_context(meta_ac);
+ meta_ac = NULL;
+ }
+
+ if (ret == -ENOSPC && try_free) {
+ /*
+ * Try to free some truncate log so that we can have enough
+ * clusters to allocate.
+ */
+ try_free = 0;
+
+ ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
+ if (ret1 == 1)
+ goto try_again;
+
+ if (ret1 < 0)
+ mlog_errno(ret1);
+ }
+
return ret;
}
@@ -1849,7 +1928,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+ ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
@@ -1883,12 +1962,12 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
}
}
- kaddr = kmap_atomic(wc->w_target_page, KM_USER0);
+ kaddr = kmap_atomic(wc->w_target_page);
memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
- mlog(0, "Data written to inode at offset %llu. "
- "id_count = %u, copied = %u, i_dyn_features = 0x%x\n",
+ trace_ocfs2_write_end_inline(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)pos, *copied,
le16_to_cpu(di->id2.i_data.id_count),
le16_to_cpu(di->i_dyn_features));
@@ -1950,7 +2029,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
out_write_size:
pos += copied;
- if (pos > inode->i_size) {
+ if (pos > i_size_read(inode)) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
@@ -1959,6 +2038,7 @@ out_write_size:
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, wc->w_di_bh);
ocfs2_commit_trans(osb, handle);
@@ -1992,9 +2072,8 @@ const struct address_space_operations ocfs2_aops = {
.write_begin = ocfs2_write_begin,
.write_end = ocfs2_write_end,
.bmap = ocfs2_bmap,
- .sync_page = block_sync_page,
.direct_IO = ocfs2_direct_IO,
- .invalidatepage = ocfs2_invalidatepage,
+ .invalidatepage = block_invalidatepage,
.releasepage = ocfs2_releasepage,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index c48e93ffc51..6cae155d54d 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,8 +22,7 @@
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
- unsigned from, unsigned to);
+#include <linux/aio.h>
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
@@ -48,7 +47,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page);
@@ -70,8 +70,36 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
else
clear_bit(1, (unsigned long *)&iocb->private);
}
+
+/*
+ * Using a named enum representing lock types in terms of #N bit stored in
+ * iocb->private, which is going to be used for communication between
+ * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
+ */
+enum ocfs2_iocb_lock_bits {
+ OCFS2_IOCB_RW_LOCK = 0,
+ OCFS2_IOCB_RW_LOCK_LEVEL,
+ OCFS2_IOCB_SEM,
+ OCFS2_IOCB_UNALIGNED_IO,
+ OCFS2_IOCB_NUM_LOCKS
+};
+
#define ocfs2_iocb_clear_rw_locked(iocb) \
- clear_bit(0, (unsigned long *)&iocb->private)
+ clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
#define ocfs2_iocb_rw_locked_level(iocb) \
- test_bit(1, (unsigned long *)&iocb->private)
+ test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_set_sem_locked(iocb) \
+ set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_sem_locked(iocb) \
+ clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_sem_locked(iocb) \
+ test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+
+#define ocfs2_iocb_set_unaligned_aio(iocb) \
+ set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_unaligned_aio(iocb) \
+ clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_unaligned_aio(iocb) \
+ test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3..0725e605465 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
* No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
* larger than 16 bits.
*/
- BUG_ON(ecc > USHORT_MAX);
+ BUG_ON(ecc > USHRT_MAX);
bc->bc_crc32e = cpu_to_le32(crc);
bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -422,45 +422,46 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
struct ocfs2_blockcheck_stats *stats)
{
int rc = 0;
- struct ocfs2_block_check check;
+ u32 bc_crc32e;
+ u16 bc_ecc;
u32 crc, ecc;
ocfs2_blockcheck_inc_check(stats);
- check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
- check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+ bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+ bc_ecc = le16_to_cpu(bc->bc_ecc);
memset(bc, 0, sizeof(struct ocfs2_block_check));
/* Fast path - if the crc32 validates, we're good to go */
crc = crc32_le(~0, data, blocksize);
- if (crc == check.bc_crc32e)
+ if (crc == bc_crc32e)
goto out;
ocfs2_blockcheck_inc_failure(stats);
mlog(ML_ERROR,
- "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
- (unsigned int)check.bc_crc32e, (unsigned int)crc);
+ "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
+ (unsigned int)bc_crc32e, (unsigned int)crc);
/* Ok, try ECC fixups */
ecc = ocfs2_hamming_encode_block(data, blocksize);
- ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
+ ocfs2_hamming_fix_block(data, blocksize, ecc ^ bc_ecc);
/* And check the crc32 again */
crc = crc32_le(~0, data, blocksize);
- if (crc == check.bc_crc32e) {
+ if (crc == bc_crc32e) {
ocfs2_blockcheck_inc_recover(stats);
goto out;
}
- mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
- (unsigned int)check.bc_crc32e, (unsigned int)crc);
+ mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
+ (unsigned int)bc_crc32e, (unsigned int)crc);
rc = -EIO;
out:
- bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
- bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+ bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+ bc->bc_ecc = cpu_to_le16(bc_ecc);
return rc;
}
@@ -508,7 +509,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
* No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
* larger than 16 bits.
*/
- BUG_ON(ecc > USHORT_MAX);
+ BUG_ON(ecc > USHRT_MAX);
bc->bc_crc32e = cpu_to_le32(crc);
bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -528,7 +529,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
struct ocfs2_blockcheck_stats *stats)
{
int i, rc = 0;
- struct ocfs2_block_check check;
+ u32 bc_crc32e;
+ u16 bc_ecc;
u32 crc, ecc, fix;
BUG_ON(nr < 0);
@@ -538,21 +540,21 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
ocfs2_blockcheck_inc_check(stats);
- check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
- check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+ bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+ bc_ecc = le16_to_cpu(bc->bc_ecc);
memset(bc, 0, sizeof(struct ocfs2_block_check));
/* Fast path - if the crc32 validates, we're good to go */
for (i = 0, crc = ~0; i < nr; i++)
crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
- if (crc == check.bc_crc32e)
+ if (crc == bc_crc32e)
goto out;
ocfs2_blockcheck_inc_failure(stats);
mlog(ML_ERROR,
"CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
- (unsigned int)check.bc_crc32e, (unsigned int)crc);
+ (unsigned int)bc_crc32e, (unsigned int)crc);
/* Ok, try ECC fixups */
for (i = 0, ecc = 0; i < nr; i++) {
@@ -565,7 +567,7 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
bhs[i]->b_size * 8,
bhs[i]->b_size * 8 * i);
}
- fix = ecc ^ check.bc_ecc;
+ fix = ecc ^ bc_ecc;
for (i = 0; i < nr; i++) {
/*
* Try the fix against each buffer. It will only affect
@@ -578,19 +580,19 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
/* And check the crc32 again */
for (i = 0, crc = ~0; i < nr; i++)
crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
- if (crc == check.bc_crc32e) {
+ if (crc == bc_crc32e) {
ocfs2_blockcheck_inc_recover(stats);
goto out;
}
mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
- (unsigned int)check.bc_crc32e, (unsigned int)crc);
+ (unsigned int)bc_crc32e, (unsigned int)crc);
rc = -EIO;
out:
- bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
- bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+ bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+ bc->bc_ecc = cpu_to_le16(bc_ecc);
return rc;
}
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 21c808f752d..1edcb141f63 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <cluster/masklog.h>
@@ -36,8 +35,8 @@
#include "inode.h"
#include "journal.h"
#include "uptodate.h"
-
#include "buffer_head_io.h"
+#include "ocfs2_trace.h"
/*
* Bits on bh->b_state used by ocfs2.
@@ -56,8 +55,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
{
int ret = 0;
- mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n",
- (unsigned long long)bh->b_blocknr, ci);
+ trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci);
BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
BUG_ON(buffer_jbd(bh));
@@ -67,6 +65,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
* can get modified during recovery even if read-only. */
if (ocfs2_is_hard_readonly(osb)) {
ret = -EROFS;
+ mlog_errno(ret);
goto out;
}
@@ -91,12 +90,11 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
* information for this bh as it's not marked locally
* uptodate. */
ret = -EIO;
- put_bh(bh);
+ mlog_errno(ret);
}
ocfs2_metadata_cache_io_unlock(ci);
out:
- mlog_exit(ret);
return ret;
}
@@ -107,16 +105,16 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int i;
struct buffer_head *bh;
- if (!nr) {
- mlog(ML_BH_IO, "No buffers will be read!\n");
+ trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
+
+ if (!nr)
goto bail;
- }
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(osb->sb, block++);
if (bhs[i] == NULL) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -124,10 +122,8 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
bh = bhs[i];
if (buffer_jbd(bh)) {
- mlog(ML_BH_IO,
- "trying to sync read a jbd "
- "managed bh (blocknr = %llu), skipping\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_read_blocks_sync_jbd(
+ (unsigned long long)bh->b_blocknr);
continue;
}
@@ -187,8 +183,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
struct buffer_head *bh;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
- mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n",
- ci, (unsigned long long)block, nr, flags);
+ trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
BUG_ON(!ci);
BUG_ON((flags & OCFS2_BH_READAHEAD) &&
@@ -208,7 +203,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
}
if (nr == 0) {
- mlog(ML_BH_IO, "No buffers will be read!\n");
status = 0;
goto bail;
}
@@ -219,7 +213,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
bhs[i] = sb_getblk(sb, block++);
if (bhs[i] == NULL) {
ocfs2_metadata_cache_io_unlock(ci);
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -252,8 +246,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
*/
if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
- mlog(ML_UPTODATE,
- "bh (%llu), owner %llu not uptodate\n",
+ trace_ocfs2_read_blocks_from_disk(
(unsigned long long)bh->b_blocknr,
(unsigned long long)ocfs2_metadata_cache_owner(ci));
/* We're using ignore_cache here to say
@@ -261,11 +254,10 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
ignore_cache = 1;
}
+ trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr,
+ ignore_cache, buffer_jbd(bh), buffer_dirty(bh));
+
if (buffer_jbd(bh)) {
- if (ignore_cache)
- mlog(ML_BH_IO, "trying to sync read a jbd "
- "managed bh (blocknr = %llu)\n",
- (unsigned long long)bh->b_blocknr);
continue;
}
@@ -273,9 +265,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
if (buffer_dirty(bh)) {
/* This should probably be a BUG, or
* at least return an error. */
- mlog(ML_BH_IO, "asking me to sync read a dirty "
- "buffer! (blocknr = %llu)\n",
- (unsigned long long)bh->b_blocknr);
continue;
}
@@ -368,14 +357,11 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
}
ocfs2_metadata_cache_io_unlock(ci);
- mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
- (unsigned long long)block, nr,
- ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
- flags);
+ trace_ocfs2_read_blocks_end((unsigned long long)block, nr,
+ flags, ignore_cache);
bail:
- mlog_exit(status);
return status;
}
@@ -407,14 +393,14 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
struct buffer_head *bh)
{
int ret = 0;
-
- mlog_entry_void();
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
BUG_ON(buffer_jbd(bh));
ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
ret = -EROFS;
+ mlog_errno(ret);
goto out;
}
@@ -426,16 +412,16 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
+ ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
submit_bh(WRITE, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ret = -EIO;
- put_bh(bh);
+ mlog_errno(ret);
}
out:
- mlog_exit(ret);
return ret;
}
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index bc8c5e7d860..1aefc0350ec 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
- quorum.o tcp.o netdebug.o ver.o
+ quorum.o tcp.o netdebug.o
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 5c989000670..73039295d0d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,8 @@
#include <linux/crc32.h>
#include <linux/time.h>
#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
#include "heartbeat.h"
#include "tcp.h"
@@ -61,10 +63,53 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
static LIST_HEAD(o2hb_node_events);
static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
+/*
+ * In global heartbeat, we maintain a series of region bitmaps.
+ * - o2hb_region_bitmap allows us to limit the region number to max region.
+ * - o2hb_live_region_bitmap tracks live regions (seen steady iterations).
+ * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
+ * heartbeat on it.
+ * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
+ */
+static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+
+#define O2HB_DB_TYPE_LIVENODES 0
+#define O2HB_DB_TYPE_LIVEREGIONS 1
+#define O2HB_DB_TYPE_QUORUMREGIONS 2
+#define O2HB_DB_TYPE_FAILEDREGIONS 3
+#define O2HB_DB_TYPE_REGION_LIVENODES 4
+#define O2HB_DB_TYPE_REGION_NUMBER 5
+#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
+#define O2HB_DB_TYPE_REGION_PINNED 7
+struct o2hb_debug_buf {
+ int db_type;
+ int db_size;
+ int db_len;
+ void *db_data;
+};
+
+static struct o2hb_debug_buf *o2hb_db_livenodes;
+static struct o2hb_debug_buf *o2hb_db_liveregions;
+static struct o2hb_debug_buf *o2hb_db_quorumregions;
+static struct o2hb_debug_buf *o2hb_db_failedregions;
+
#define O2HB_DEBUG_DIR "o2hb"
#define O2HB_DEBUG_LIVENODES "livenodes"
+#define O2HB_DEBUG_LIVEREGIONS "live_regions"
+#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions"
+#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
+#define O2HB_DEBUG_REGION_NUMBER "num"
+#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
+#define O2HB_DEBUG_REGION_PINNED "pinned"
+
static struct dentry *o2hb_debug_dir;
static struct dentry *o2hb_debug_livenodes;
+static struct dentry *o2hb_debug_liveregions;
+static struct dentry *o2hb_debug_quorumregions;
+static struct dentry *o2hb_debug_failedregions;
static LIST_HEAD(o2hb_all_regions);
@@ -76,7 +121,46 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
#define O2HB_DEFAULT_BLOCK_BITS 9
+enum o2hb_heartbeat_modes {
+ O2HB_HEARTBEAT_LOCAL = 0,
+ O2HB_HEARTBEAT_GLOBAL,
+ O2HB_HEARTBEAT_NUM_MODES,
+};
+
+char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
+ "local", /* O2HB_HEARTBEAT_LOCAL */
+ "global", /* O2HB_HEARTBEAT_GLOBAL */
+};
+
unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
+unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
+
+/*
+ * o2hb_dependent_users tracks the number of registered callbacks that depend
+ * on heartbeat. o2net and o2dlm are two entities that register this callback.
+ * However only o2dlm depends on the heartbeat. It does not want the heartbeat
+ * to stop while a dlm domain is still active.
+ */
+unsigned int o2hb_dependent_users;
+
+/*
+ * In global heartbeat mode, all regions are pinned if there are one or more
+ * dependent users and the quorum region count is <= O2HB_PIN_CUT_OFF. All
+ * regions are unpinned if the region count exceeds the cut off or the number
+ * of dependent users falls to zero.
+ */
+#define O2HB_PIN_CUT_OFF 3
+
+/*
+ * In local heartbeat mode, we assume the dlm domain name to be the same as
+ * region uuid. This is true for domains created for the file system but not
+ * necessarily true for userdlm domains. This is a known limitation.
+ *
+ * In global heartbeat mode, we pin/unpin all o2hb regions. This solution
+ * works for both file system and userdlm domains.
+ */
+static int o2hb_region_pin(const char *region_uuid);
+static void o2hb_region_unpin(const char *region_uuid);
/* Only sets a new threshold if there are no active regions.
*
@@ -93,6 +177,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
}
}
+static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
+{
+ int ret = -1;
+
+ if (hb_mode < O2HB_HEARTBEAT_NUM_MODES) {
+ spin_lock(&o2hb_live_lock);
+ if (list_empty(&o2hb_all_regions)) {
+ o2hb_heartbeat_mode = hb_mode;
+ ret = 0;
+ }
+ spin_unlock(&o2hb_live_lock);
+ }
+
+ return ret;
+}
+
struct o2hb_node_event {
struct list_head hn_item;
enum o2hb_callback_type hn_event_type;
@@ -116,7 +216,10 @@ struct o2hb_region {
struct config_item hr_item;
struct list_head hr_all_item;
- unsigned hr_unclean_stop:1;
+ unsigned hr_unclean_stop:1,
+ hr_aborted_start:1,
+ hr_item_pinned:1,
+ hr_item_dropped:1;
/* protected by the hr_callback_sem */
struct task_struct *hr_task;
@@ -134,11 +237,29 @@ struct o2hb_region {
struct block_device *hr_bdev;
struct o2hb_disk_slot *hr_slots;
+ /* live node map of this region */
+ unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned int hr_region_num;
+
+ struct dentry *hr_debug_dir;
+ struct dentry *hr_debug_livenodes;
+ struct dentry *hr_debug_regnum;
+ struct dentry *hr_debug_elapsed_time;
+ struct dentry *hr_debug_pinned;
+ struct o2hb_debug_buf *hr_db_livenodes;
+ struct o2hb_debug_buf *hr_db_regnum;
+ struct o2hb_debug_buf *hr_db_elapsed_time;
+ struct o2hb_debug_buf *hr_db_pinned;
+
/* let the person setting up hb wait for it to return until it
* has reached a 'steady' state. This will be fixed when we have
* a more complete api that doesn't lead to this sort of fragility. */
atomic_t hr_steady_iterations;
+ /* terminate o2hb thread if it does not reach steady state
+ * (hr_steady_iterations == 0) within hr_unsteady_iterations */
+ atomic_t hr_unsteady_iterations;
+
char hr_dev_name[BDEVNAME_SIZE];
unsigned int hr_timeout_ms;
@@ -164,6 +285,8 @@ struct o2hb_bio_wait_ctxt {
static void o2hb_write_timeout(struct work_struct *work)
{
+ int failed, quorum;
+ unsigned long flags;
struct o2hb_region *reg =
container_of(work, struct o2hb_region,
hr_write_timeout_work.work);
@@ -171,14 +294,45 @@ static void o2hb_write_timeout(struct work_struct *work)
mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
"milliseconds\n", reg->hr_dev_name,
jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
+
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock_irqsave(&o2hb_live_lock, flags);
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+ failed = bitmap_weight(o2hb_failed_region_bitmap,
+ O2NM_MAX_REGIONS);
+ quorum = bitmap_weight(o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS);
+ spin_unlock_irqrestore(&o2hb_live_lock, flags);
+
+ mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
+ quorum, failed);
+
+ /*
+ * Fence if the number of failed regions >= half the number
+ * of quorum regions
+ */
+ if ((failed << 1) < quorum)
+ return;
+ }
+
o2quo_disk_timeout();
}
static void o2hb_arm_write_timeout(struct o2hb_region *reg)
{
+ /* Arm writeout only after thread reaches steady state */
+ if (atomic_read(&reg->hr_steady_iterations) != 0)
+ return;
+
mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
O2HB_MAX_WRITE_TIMEOUT_MS);
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ }
cancel_delayed_work(&reg->hr_write_timeout_work);
reg->hr_last_timeout_start = jiffies;
schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -187,8 +341,7 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
{
- cancel_delayed_work(&reg->hr_write_timeout_work);
- flush_scheduled_work();
+ cancel_delayed_work_sync(&reg->hr_write_timeout_work);
}
static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -215,11 +368,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
static void o2hb_wait_on_io(struct o2hb_region *reg,
struct o2hb_bio_wait_ctxt *wc)
{
- struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
-
- blk_run_address_space(mapping);
o2hb_bio_wait_dec(wc, 1);
-
wait_for_completion(&wc->wc_io_complete);
}
@@ -264,7 +413,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
}
/* Must put everything in 512 byte sectors for the bio... */
- bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9);
+ bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
bio->bi_bdev = reg->hr_bdev;
bio->bi_private = wc;
bio->bi_end_io = o2hb_bio_end_io;
@@ -343,7 +492,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
}
atomic_inc(&write_wc->wc_num_reqs);
- submit_bio(WRITE, bio);
+ submit_bio(WRITE_SYNC, bio);
status = 0;
bail:
@@ -389,27 +538,50 @@ static int o2hb_verify_crc(struct o2hb_region *reg,
return read == computed;
}
-/* We want to make sure that nobody is heartbeating on top of us --
- * this will help detect an invalid configuration. */
-static int o2hb_check_last_timestamp(struct o2hb_region *reg)
+/*
+ * Compare the slot data with what we wrote in the last iteration.
+ * If the match fails, print an appropriate error message. This is to
+ * detect errors like... another node hearting on the same slot,
+ * flaky device that is losing writes, etc.
+ * Returns 1 if check succeeds, 0 otherwise.
+ */
+static int o2hb_check_own_slot(struct o2hb_region *reg)
{
- int node_num, ret;
struct o2hb_disk_slot *slot;
struct o2hb_disk_heartbeat_block *hb_block;
+ char *errstr;
- node_num = o2nm_this_node();
-
- ret = 1;
- slot = &reg->hr_slots[node_num];
+ slot = &reg->hr_slots[o2nm_this_node()];
/* Don't check on our 1st timestamp */
- if (slot->ds_last_time) {
- hb_block = slot->ds_raw_block;
+ if (!slot->ds_last_time)
+ return 0;
- if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time)
- ret = 0;
- }
+ hb_block = slot->ds_raw_block;
+ if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
+ le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
+ hb_block->hb_node == slot->ds_node_num)
+ return 1;
+
+#define ERRSTR1 "Another node is heartbeating on device"
+#define ERRSTR2 "Heartbeat generation mismatch on device"
+#define ERRSTR3 "Heartbeat sequence mismatch on device"
+
+ if (hb_block->hb_node != slot->ds_node_num)
+ errstr = ERRSTR1;
+ else if (le64_to_cpu(hb_block->hb_generation) !=
+ slot->ds_last_generation)
+ errstr = ERRSTR2;
+ else
+ errstr = ERRSTR3;
- return ret;
+ mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), "
+ "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name,
+ slot->ds_node_num, (unsigned long long)slot->ds_last_generation,
+ (unsigned long long)slot->ds_last_time, hb_block->hb_node,
+ (unsigned long long)le64_to_cpu(hb_block->hb_generation),
+ (unsigned long long)le64_to_cpu(hb_block->hb_seq));
+
+ return 0;
}
static inline void o2hb_prepare_block(struct o2hb_region *reg,
@@ -448,11 +620,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
struct o2nm_node *node,
int idx)
{
- struct list_head *iter;
struct o2hb_callback_func *f;
- list_for_each(iter, &hbcall->list) {
- f = list_entry(iter, struct o2hb_callback_func, hc_item);
+ list_for_each_entry(f, &hbcall->list, hc_item) {
mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
(f->hc_func)(node, idx, f->hc_data);
}
@@ -461,16 +631,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
/* Will run the list in order until we process the passed event */
static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
{
- int empty;
struct o2hb_callback *hbcall;
struct o2hb_node_event *event;
- spin_lock(&o2hb_live_lock);
- empty = list_empty(&queued_event->hn_item);
- spin_unlock(&o2hb_live_lock);
- if (empty)
- return;
-
/* Holding callback sem assures we don't alter the callback
* lists when doing this, and serializes ourselves with other
* processes wanting callbacks. */
@@ -512,6 +675,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
{
assert_spin_locked(&o2hb_live_lock);
+ BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
+
event->hn_event_type = type;
event->hn_node = node;
event->hn_node_num = node_num;
@@ -527,6 +692,7 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
struct o2hb_node_event event =
{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
struct o2nm_node *node;
+ int queued = 0;
node = o2nm_get_node_by_num(slot->ds_node_num);
if (!node)
@@ -544,15 +710,60 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
slot->ds_node_num);
+ queued = 1;
}
}
spin_unlock(&o2hb_live_lock);
- o2hb_run_event_list(&event);
+ if (queued)
+ o2hb_run_event_list(&event);
o2nm_node_put(node);
}
+static void o2hb_set_quorum_device(struct o2hb_region *reg)
+{
+ if (!o2hb_global_heartbeat_active())
+ return;
+
+ /* Prevent race with o2hb_heartbeat_group_drop_item() */
+ if (kthread_should_stop())
+ return;
+
+ /* Tag region as quorum only after thread reaches steady state */
+ if (atomic_read(&reg->hr_steady_iterations) != 0)
+ return;
+
+ spin_lock(&o2hb_live_lock);
+
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ goto unlock;
+
+ /*
+ * A region can be added to the quorum only when it sees all
+ * live nodes heartbeat on it. In other words, the region has been
+ * added to all nodes.
+ */
+ if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+ sizeof(o2hb_live_node_bitmap)))
+ goto unlock;
+
+ printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+
+ set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+
+ /*
+ * If global heartbeat active, unpin all regions if the
+ * region count > CUT_OFF
+ */
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
+ o2hb_region_unpin(NULL);
+unlock:
+ spin_unlock(&o2hb_live_lock);
+}
+
static int o2hb_check_slot(struct o2hb_region *reg,
struct o2hb_disk_slot *slot)
{
@@ -564,14 +775,23 @@ static int o2hb_check_slot(struct o2hb_region *reg,
u64 cputime;
unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
unsigned int slot_dead_ms;
+ int tmp;
+ int queued = 0;
memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
- /* Is this correct? Do we assume that the node doesn't exist
- * if we're not configured for him? */
+ /*
+ * If a node is no longer configured but is still in the livemap, we
+ * may need to clear that bit from the livemap.
+ */
node = o2nm_get_node_by_num(slot->ds_node_num);
- if (!node)
- return 0;
+ if (!node) {
+ spin_lock(&o2hb_live_lock);
+ tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ if (!tmp)
+ return 0;
+ }
if (!o2hb_verify_crc(reg, hb_block)) {
/* all paths from here will drop o2hb_live_lock for
@@ -638,14 +858,19 @@ fire_callbacks:
mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
slot->ds_node_num, (long long)slot->ds_last_generation);
+ set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
/* first on the list generates a callback */
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+ mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
+ "bitmap\n", slot->ds_node_num);
set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
slot->ds_node_num);
changed = 1;
+ queued = 1;
}
list_add_tail(&slot->ds_live_item,
@@ -683,15 +908,21 @@ fire_callbacks:
mlog(ML_HEARTBEAT, "Node %d left my region\n",
slot->ds_node_num);
+ clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
/* last off the live_slot generates a callback */
list_del_init(&slot->ds_live_item);
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+ mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
+ "nodes bitmap\n", slot->ds_node_num);
clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
- o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
- slot->ds_node_num);
+ /* node can be null */
+ o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
+ node, slot->ds_node_num);
changed = 1;
+ queued = 1;
}
/* We don't clear this because the node is still
@@ -707,48 +938,50 @@ fire_callbacks:
out:
spin_unlock(&o2hb_live_lock);
- o2hb_run_event_list(&event);
+ if (queued)
+ o2hb_run_event_list(&event);
- o2nm_node_put(node);
+ if (node)
+ o2nm_node_put(node);
return changed;
}
-/* This could be faster if we just implmented a find_last_bit, but I
- * don't think the circumstances warrant it. */
-static int o2hb_highest_node(unsigned long *nodes,
- int numbits)
+static int o2hb_highest_node(unsigned long *nodes, int numbits)
{
- int highest, node;
-
- highest = numbits;
- node = -1;
- while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
- if (node >= numbits)
- break;
-
- highest = node;
- }
-
- return highest;
+ return find_last_bit(nodes, numbits);
}
static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
{
- int i, ret, highest_node, change = 0;
+ int i, ret, highest_node;
+ int membership_change = 0, own_slot_ok = 0;
unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct o2hb_bio_wait_ctxt write_wc;
ret = o2nm_configured_node_map(configured_nodes,
sizeof(configured_nodes));
if (ret) {
mlog_errno(ret);
- return ret;
+ goto bail;
+ }
+
+ /*
+ * If a node is not configured but is in the livemap, we still need
+ * to read the slot so as to be able to remove it from the livemap.
+ */
+ o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ i = -1;
+ while ((i = find_next_bit(live_node_bitmap,
+ O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+ set_bit(i, configured_nodes);
}
highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
if (highest_node >= O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
- return -EINVAL;
+ mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
+ ret = -EINVAL;
+ goto bail;
}
/* No sense in reading the slots of nodes that don't exist
@@ -758,31 +991,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
ret = o2hb_read_slots(reg, highest_node + 1);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
/* With an up to date view of the slots, we can check that no
* other node has been improperly configured to heartbeat in
* our slot. */
- if (!o2hb_check_last_timestamp(reg))
- mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
- "in our slot!\n", reg->hr_dev_name);
+ own_slot_ok = o2hb_check_own_slot(reg);
/* fill in the proper info for our next heartbeat */
o2hb_prepare_block(reg, reg->hr_generation);
- /* And fire off the write. Note that we don't wait on this I/O
- * until later. */
ret = o2hb_issue_node_write(reg, &write_wc);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto bail;
}
i = -1;
- while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
-
- change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
+ while((i = find_next_bit(configured_nodes,
+ O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+ membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
}
/*
@@ -797,18 +1026,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* disk */
mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
write_wc.wc_error, reg->hr_dev_name);
- return write_wc.wc_error;
+ ret = write_wc.wc_error;
+ goto bail;
}
- o2hb_arm_write_timeout(reg);
+ /* Skip disarming the timeout if own slot has stale/bad data */
+ if (own_slot_ok) {
+ o2hb_set_quorum_device(reg);
+ o2hb_arm_write_timeout(reg);
+ }
+bail:
/* let the person who launched us know when things are steady */
- if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
- if (atomic_dec_and_test(&reg->hr_steady_iterations))
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ if (!ret && own_slot_ok && !membership_change) {
+ if (atomic_dec_and_test(&reg->hr_steady_iterations))
+ wake_up(&o2hb_steady_queue);
+ }
+ }
+
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
+ printk(KERN_NOTICE "o2hb: Unable to stabilize "
+ "heartbeart on region %s (%s)\n",
+ config_item_name(&reg->hr_item),
+ reg->hr_dev_name);
+ atomic_set(&reg->hr_steady_iterations, 0);
+ reg->hr_aborted_start = 1;
wake_up(&o2hb_steady_queue);
+ ret = -EIO;
+ }
}
- return 0;
+ return ret;
}
/* Subtract b from a, storing the result in a. a *must* have a larger
@@ -857,9 +1107,13 @@ static int o2hb_thread(void *data)
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
- set_user_nice(current, -20);
+ set_user_nice(current, MIN_NICE);
+
+ /* Pin node */
+ o2nm_depend_this_node();
- while (!kthread_should_stop() && !reg->hr_unclean_stop) {
+ while (!kthread_should_stop() &&
+ !reg->hr_unclean_stop && !reg->hr_aborted_start) {
/* We track the time spent inside
* o2hb_do_disk_heartbeat so that we avoid more than
* hr_timeout_ms between disk writes. On busy systems
@@ -867,10 +1121,7 @@ static int o2hb_thread(void *data)
* likely to time itself out. */
do_gettimeofday(&before_hb);
- i = 0;
- do {
- ret = o2hb_do_disk_heartbeat(reg);
- } while (ret && ++i < 2);
+ ret = o2hb_do_disk_heartbeat(reg);
do_gettimeofday(&after_hb);
elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
@@ -881,7 +1132,8 @@ static int o2hb_thread(void *data)
after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
elapsed_msec);
- if (elapsed_msec < reg->hr_timeout_ms) {
+ if (!kthread_should_stop() &&
+ elapsed_msec < reg->hr_timeout_ms) {
/* the kthread api has blocked signals for us so no
* need to record the return value. */
msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
@@ -898,17 +1150,20 @@ static int o2hb_thread(void *data)
* to timeout on this region when we could just as easily
* write a clear generation - thus indicating to them that
* this node has left this region.
- *
- * XXX: Should we skip this on unclean_stop? */
- o2hb_prepare_block(reg, 0);
- ret = o2hb_issue_node_write(reg, &write_wc);
- if (ret == 0) {
- o2hb_wait_on_io(reg, &write_wc);
- } else {
- mlog_errno(ret);
+ */
+ if (!reg->hr_unclean_stop && !reg->hr_aborted_start) {
+ o2hb_prepare_block(reg, 0);
+ ret = o2hb_issue_node_write(reg, &write_wc);
+ if (ret == 0)
+ o2hb_wait_on_io(reg, &write_wc);
+ else
+ mlog_errno(ret);
}
- mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
+ /* Unpin node */
+ o2nm_undepend_this_node();
+
+ mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n");
return 0;
}
@@ -916,21 +1171,68 @@ static int o2hb_thread(void *data)
#ifdef CONFIG_DEBUG_FS
static int o2hb_debug_open(struct inode *inode, struct file *file)
{
+ struct o2hb_debug_buf *db = inode->i_private;
+ struct o2hb_region *reg;
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long lts;
char *buf = NULL;
int i = -1;
int out = 0;
+ /* max_nodes should be the largest bitmap we pass here */
+ BUG_ON(sizeof(map) < db->db_size);
+
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!buf)
goto bail;
- o2hb_fill_node_map(map, sizeof(map));
+ switch (db->db_type) {
+ case O2HB_DB_TYPE_LIVENODES:
+ case O2HB_DB_TYPE_LIVEREGIONS:
+ case O2HB_DB_TYPE_QUORUMREGIONS:
+ case O2HB_DB_TYPE_FAILEDREGIONS:
+ spin_lock(&o2hb_live_lock);
+ memcpy(map, db->db_data, db->db_size);
+ spin_unlock(&o2hb_live_lock);
+ break;
- while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+ case O2HB_DB_TYPE_REGION_LIVENODES:
+ spin_lock(&o2hb_live_lock);
+ reg = (struct o2hb_region *)db->db_data;
+ memcpy(map, reg->hr_live_node_bitmap, db->db_size);
+ spin_unlock(&o2hb_live_lock);
+ break;
+
+ case O2HB_DB_TYPE_REGION_NUMBER:
+ reg = (struct o2hb_region *)db->db_data;
+ out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
+ reg->hr_region_num);
+ goto done;
+
+ case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
+ reg = (struct o2hb_region *)db->db_data;
+ lts = reg->hr_last_timeout_start;
+ /* If 0, it has never been set before */
+ if (lts)
+ lts = jiffies_to_msecs(jiffies - lts);
+ out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
+ goto done;
+
+ case O2HB_DB_TYPE_REGION_PINNED:
+ reg = (struct o2hb_region *)db->db_data;
+ out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+ !!reg->hr_item_pinned);
+ goto done;
+
+ default:
+ goto done;
+ }
+
+ while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+done:
i_size_write(inode, out);
file->private_data = buf;
@@ -977,10 +1279,104 @@ static const struct file_operations o2hb_debug_fops = {
void o2hb_exit(void)
{
- if (o2hb_debug_livenodes)
- debugfs_remove(o2hb_debug_livenodes);
- if (o2hb_debug_dir)
- debugfs_remove(o2hb_debug_dir);
+ kfree(o2hb_db_livenodes);
+ kfree(o2hb_db_liveregions);
+ kfree(o2hb_db_quorumregions);
+ kfree(o2hb_db_failedregions);
+ debugfs_remove(o2hb_debug_failedregions);
+ debugfs_remove(o2hb_debug_quorumregions);
+ debugfs_remove(o2hb_debug_liveregions);
+ debugfs_remove(o2hb_debug_livenodes);
+ debugfs_remove(o2hb_debug_dir);
+}
+
+static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
+ struct o2hb_debug_buf **db, int db_len,
+ int type, int size, int len, void *data)
+{
+ *db = kmalloc(db_len, GFP_KERNEL);
+ if (!*db)
+ return NULL;
+
+ (*db)->db_type = type;
+ (*db)->db_size = size;
+ (*db)->db_len = len;
+ (*db)->db_data = data;
+
+ return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
+ &o2hb_debug_fops);
+}
+
+static int o2hb_debug_init(void)
+{
+ int ret = -ENOMEM;
+
+ o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+ if (!o2hb_debug_dir) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+ o2hb_debug_dir,
+ &o2hb_db_livenodes,
+ sizeof(*o2hb_db_livenodes),
+ O2HB_DB_TYPE_LIVENODES,
+ sizeof(o2hb_live_node_bitmap),
+ O2NM_MAX_NODES,
+ o2hb_live_node_bitmap);
+ if (!o2hb_debug_livenodes) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
+ o2hb_debug_dir,
+ &o2hb_db_liveregions,
+ sizeof(*o2hb_db_liveregions),
+ O2HB_DB_TYPE_LIVEREGIONS,
+ sizeof(o2hb_live_region_bitmap),
+ O2NM_MAX_REGIONS,
+ o2hb_live_region_bitmap);
+ if (!o2hb_debug_liveregions) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ o2hb_debug_quorumregions =
+ o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
+ o2hb_debug_dir,
+ &o2hb_db_quorumregions,
+ sizeof(*o2hb_db_quorumregions),
+ O2HB_DB_TYPE_QUORUMREGIONS,
+ sizeof(o2hb_quorum_region_bitmap),
+ O2NM_MAX_REGIONS,
+ o2hb_quorum_region_bitmap);
+ if (!o2hb_debug_quorumregions) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ o2hb_debug_failedregions =
+ o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
+ o2hb_debug_dir,
+ &o2hb_db_failedregions,
+ sizeof(*o2hb_db_failedregions),
+ O2HB_DB_TYPE_FAILEDREGIONS,
+ sizeof(o2hb_failed_region_bitmap),
+ O2NM_MAX_REGIONS,
+ o2hb_failed_region_bitmap);
+ if (!o2hb_debug_failedregions) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ ret = 0;
+bail:
+ if (ret)
+ o2hb_exit();
+
+ return ret;
}
int o2hb_init(void)
@@ -996,24 +1392,14 @@ int o2hb_init(void)
INIT_LIST_HEAD(&o2hb_node_events);
memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+ memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
+ memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
+ memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
+ memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
- o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
- if (!o2hb_debug_dir) {
- mlog_errno(-ENOMEM);
- return -ENOMEM;
- }
-
- o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
- S_IFREG|S_IRUSR,
- o2hb_debug_dir, NULL,
- &o2hb_debug_fops);
- if (!o2hb_debug_livenodes) {
- mlog_errno(-ENOMEM);
- debugfs_remove(o2hb_debug_dir);
- return -ENOMEM;
- }
+ o2hb_dependent_users = 0;
- return 0;
+ return o2hb_debug_init();
}
/* if we're already in a callback then we're already serialized by the sem */
@@ -1059,8 +1445,9 @@ static void o2hb_region_release(struct config_item *item)
struct page *page;
struct o2hb_region *reg = to_o2hb_region(item);
- if (reg->hr_tmp_block)
- kfree(reg->hr_tmp_block);
+ mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
+
+ kfree(reg->hr_tmp_block);
if (reg->hr_slot_data) {
for (i = 0; i < reg->hr_num_pages; i++) {
@@ -1074,8 +1461,15 @@ static void o2hb_region_release(struct config_item *item)
if (reg->hr_bdev)
blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
- if (reg->hr_slots)
- kfree(reg->hr_slots);
+ kfree(reg->hr_slots);
+
+ kfree(reg->hr_db_regnum);
+ kfree(reg->hr_db_livenodes);
+ debugfs_remove(reg->hr_debug_livenodes);
+ debugfs_remove(reg->hr_debug_regnum);
+ debugfs_remove(reg->hr_debug_elapsed_time);
+ debugfs_remove(reg->hr_debug_pinned);
+ debugfs_remove(reg->hr_debug_dir);
spin_lock(&o2hb_live_lock);
list_del(&reg->hr_all_item);
@@ -1293,8 +1687,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
struct o2hb_disk_slot *slot;
struct o2hb_disk_heartbeat_block *hb_block;
- mlog_entry_void();
-
ret = o2hb_read_slots(reg, reg->hr_blocks);
if (ret) {
mlog_errno(ret);
@@ -1316,7 +1708,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
}
out:
- mlog_exit(ret);
return ret;
}
@@ -1329,9 +1720,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
long fd;
int sectsize;
char *p = (char *)page;
- struct file *filp = NULL;
- struct inode *inode = NULL;
+ struct fd f;
+ struct inode *inode;
ssize_t ret = -EINVAL;
+ int live_threshold;
if (reg->hr_bdev)
goto out;
@@ -1348,26 +1740,26 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
if (fd < 0 || fd >= INT_MAX)
goto out;
- filp = fget(fd);
- if (filp == NULL)
+ f = fdget(fd);
+ if (f.file == NULL)
goto out;
if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
reg->hr_block_bytes == 0)
- goto out;
+ goto out2;
- inode = igrab(filp->f_mapping->host);
+ inode = igrab(f.file->f_mapping->host);
if (inode == NULL)
- goto out;
+ goto out2;
if (!S_ISBLK(inode->i_mode))
- goto out;
+ goto out3;
- reg->hr_bdev = I_BDEV(filp->f_mapping->host);
- ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ);
+ reg->hr_bdev = I_BDEV(f.file->f_mapping->host);
+ ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
if (ret) {
reg->hr_bdev = NULL;
- goto out;
+ goto out3;
}
inode = NULL;
@@ -1379,7 +1771,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
"blocksize %u incorrect for device, expected %d",
reg->hr_block_bytes, sectsize);
ret = -EINVAL;
- goto out;
+ goto out3;
}
o2hb_init_region_params(reg);
@@ -1393,13 +1785,13 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
ret = o2hb_map_slot_data(reg);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out3;
}
ret = o2hb_populate_slot_data(reg);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out3;
}
INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
@@ -1408,15 +1800,28 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
* A node is considered live after it has beat LIVE_THRESHOLD
* times. We're not steady until we've given them a chance
* _after_ our first read.
+ * The default threshold is bare minimum so as to limit the delay
+ * during mounts. For global heartbeat, the threshold doubled for the
+ * first region.
*/
- atomic_set(&reg->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1);
+ live_threshold = O2HB_LIVE_THRESHOLD;
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
+ live_threshold <<= 1;
+ spin_unlock(&o2hb_live_lock);
+ }
+ ++live_threshold;
+ atomic_set(&reg->hr_steady_iterations, live_threshold);
+ /* unsteady_iterations is double the steady_iterations */
+ atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
reg->hr_item.ci_name);
if (IS_ERR(hb_task)) {
ret = PTR_ERR(hb_task);
mlog_errno(ret);
- goto out;
+ goto out3;
}
spin_lock(&o2hb_live_lock);
@@ -1426,20 +1831,20 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
ret = wait_event_interruptible(o2hb_steady_queue,
atomic_read(&reg->hr_steady_iterations) == 0);
if (ret) {
- /* We got interrupted (hello ptrace!). Clean up */
- spin_lock(&o2hb_live_lock);
- hb_task = reg->hr_task;
- reg->hr_task = NULL;
- spin_unlock(&o2hb_live_lock);
+ atomic_set(&reg->hr_steady_iterations, 0);
+ reg->hr_aborted_start = 1;
+ }
- if (hb_task)
- kthread_stop(hb_task);
- goto out;
+ if (reg->hr_aborted_start) {
+ ret = -EIO;
+ goto out3;
}
/* Ok, we were woken. Make sure it wasn't by drop_item() */
spin_lock(&o2hb_live_lock);
hb_task = reg->hr_task;
+ if (o2hb_global_heartbeat_active())
+ set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
spin_unlock(&o2hb_live_lock);
if (hb_task)
@@ -1447,11 +1852,15 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
else
ret = -EIO;
+ if (hb_task && o2hb_global_heartbeat_active())
+ printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
+ config_item_name(&reg->hr_item), reg->hr_dev_name);
+
+out3:
+ iput(inode);
+out2:
+ fdput(f);
out:
- if (filp)
- fput(filp);
- if (inode)
- iput(inode);
if (ret < 0) {
if (reg->hr_bdev) {
blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
@@ -1585,22 +1994,113 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
: NULL;
}
+static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+{
+ int ret = -ENOMEM;
+
+ reg->hr_debug_dir =
+ debugfs_create_dir(config_item_name(&reg->hr_item), dir);
+ if (!reg->hr_debug_dir) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ reg->hr_debug_livenodes =
+ o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+ reg->hr_debug_dir,
+ &(reg->hr_db_livenodes),
+ sizeof(*(reg->hr_db_livenodes)),
+ O2HB_DB_TYPE_REGION_LIVENODES,
+ sizeof(reg->hr_live_node_bitmap),
+ O2NM_MAX_NODES, reg);
+ if (!reg->hr_debug_livenodes) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ reg->hr_debug_regnum =
+ o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
+ reg->hr_debug_dir,
+ &(reg->hr_db_regnum),
+ sizeof(*(reg->hr_db_regnum)),
+ O2HB_DB_TYPE_REGION_NUMBER,
+ 0, O2NM_MAX_NODES, reg);
+ if (!reg->hr_debug_regnum) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ reg->hr_debug_elapsed_time =
+ o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
+ reg->hr_debug_dir,
+ &(reg->hr_db_elapsed_time),
+ sizeof(*(reg->hr_db_elapsed_time)),
+ O2HB_DB_TYPE_REGION_ELAPSED_TIME,
+ 0, 0, reg);
+ if (!reg->hr_debug_elapsed_time) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ reg->hr_debug_pinned =
+ o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
+ reg->hr_debug_dir,
+ &(reg->hr_db_pinned),
+ sizeof(*(reg->hr_db_pinned)),
+ O2HB_DB_TYPE_REGION_PINNED,
+ 0, 0, reg);
+ if (!reg->hr_debug_pinned) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ ret = 0;
+bail:
+ return ret;
+}
+
static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
const char *name)
{
struct o2hb_region *reg = NULL;
+ int ret;
reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
if (reg == NULL)
return ERR_PTR(-ENOMEM);
- config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+ if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) {
+ ret = -ENAMETOOLONG;
+ goto free;
+ }
spin_lock(&o2hb_live_lock);
+ reg->hr_region_num = 0;
+ if (o2hb_global_heartbeat_active()) {
+ reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
+ O2NM_MAX_REGIONS);
+ if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
+ spin_unlock(&o2hb_live_lock);
+ ret = -EFBIG;
+ goto free;
+ }
+ set_bit(reg->hr_region_num, o2hb_region_bitmap);
+ }
list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
spin_unlock(&o2hb_live_lock);
+ config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+
+ ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
+ if (ret) {
+ config_item_put(&reg->hr_item);
+ goto free;
+ }
+
return &reg->hr_item;
+free:
+ kfree(reg);
+ return ERR_PTR(ret);
}
static void o2hb_heartbeat_group_drop_item(struct config_group *group,
@@ -1608,26 +2108,62 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
{
struct task_struct *hb_task;
struct o2hb_region *reg = to_o2hb_region(item);
+ int quorum_region = 0;
/* stop the thread when the user removes the region dir */
spin_lock(&o2hb_live_lock);
hb_task = reg->hr_task;
reg->hr_task = NULL;
+ reg->hr_item_dropped = 1;
spin_unlock(&o2hb_live_lock);
if (hb_task)
kthread_stop(hb_task);
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+ clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ quorum_region = 1;
+ clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
+ ((atomic_read(&reg->hr_steady_iterations) == 0) ?
+ "stopped" : "start aborted"), config_item_name(item),
+ reg->hr_dev_name);
+ }
+
/*
* If we're racing a dev_write(), we need to wake them. They will
* check reg->hr_task
*/
if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ reg->hr_aborted_start = 1;
atomic_set(&reg->hr_steady_iterations, 0);
wake_up(&o2hb_steady_queue);
}
config_item_put(item);
+
+ if (!o2hb_global_heartbeat_active() || !quorum_region)
+ return;
+
+ /*
+ * If global heartbeat active and there are dependent users,
+ * pin all regions if quorum region count <= CUT_OFF
+ */
+ spin_lock(&o2hb_live_lock);
+
+ if (!o2hb_dependent_users)
+ goto unlock;
+
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+ o2hb_region_pin(NULL);
+
+unlock:
+ spin_unlock(&o2hb_live_lock);
}
struct o2hb_heartbeat_group_attribute {
@@ -1687,6 +2223,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
return count;
}
+static
+ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
+ char *page)
+{
+ return sprintf(page, "%s\n",
+ o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
+}
+
+static
+ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
+ const char *page, size_t count)
+{
+ unsigned int i;
+ int ret;
+ size_t len;
+
+ len = (page[count - 1] == '\n') ? count - 1 : count;
+ if (!len)
+ return -EINVAL;
+
+ for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
+ if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
+ continue;
+
+ ret = o2hb_global_heartbeat_mode_set(i);
+ if (!ret)
+ printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
+ o2hb_heartbeat_mode_desc[i]);
+ return count;
+ }
+
+ return -EINVAL;
+
+}
+
static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "dead_threshold",
@@ -1695,12 +2266,21 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold
.store = o2hb_heartbeat_group_threshold_store,
};
+static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "mode",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = o2hb_heartbeat_group_mode_show,
+ .store = o2hb_heartbeat_group_mode_store,
+};
+
static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
&o2hb_heartbeat_group_attr_threshold.attr,
+ &o2hb_heartbeat_group_attr_mode.attr,
NULL,
};
-static struct configfs_item_operations o2hb_hearbeat_group_item_ops = {
+static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
.show_attribute = o2hb_heartbeat_group_show,
.store_attribute = o2hb_heartbeat_group_store,
};
@@ -1712,7 +2292,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
static struct config_item_type o2hb_heartbeat_group_type = {
.ct_group_ops = &o2hb_heartbeat_group_group_ops,
- .ct_item_ops = &o2hb_hearbeat_group_item_ops,
+ .ct_item_ops = &o2hb_heartbeat_group_item_ops,
.ct_attrs = o2hb_heartbeat_group_attrs,
.ct_owner = THIS_MODULE,
};
@@ -1744,7 +2324,7 @@ void o2hb_free_hb_set(struct config_group *group)
kfree(hs);
}
-/* hb callback registration and issueing */
+/* hb callback registration and issuing */
static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type)
{
@@ -1769,70 +2349,150 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
}
EXPORT_SYMBOL_GPL(o2hb_setup_callback);
-static struct o2hb_region *o2hb_find_region(const char *region_uuid)
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only pin the matching region. In global we pin all the active
+ * regions.
+ */
+static int o2hb_region_pin(const char *region_uuid)
{
- struct o2hb_region *p, *reg = NULL;
+ int ret = 0, found = 0;
+ struct o2hb_region *reg;
+ char *uuid;
assert_spin_locked(&o2hb_live_lock);
- list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
- if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
- reg = p;
- break;
+ list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
+
+ uuid = config_item_name(&reg->hr_item);
+
+ /* local heartbeat */
+ if (region_uuid) {
+ if (strcmp(region_uuid, uuid))
+ continue;
+ found = 1;
+ }
+
+ if (reg->hr_item_pinned || reg->hr_item_dropped)
+ goto skip_pin;
+
+ /* Ignore ENOENT only for local hb (userdlm domain) */
+ ret = o2nm_depend_item(&reg->hr_item);
+ if (!ret) {
+ mlog(ML_CLUSTER, "Pin region %s\n", uuid);
+ reg->hr_item_pinned = 1;
+ } else {
+ if (ret == -ENOENT && found)
+ ret = 0;
+ else {
+ mlog(ML_ERROR, "Pin region %s fails with %d\n",
+ uuid, ret);
+ break;
+ }
}
+skip_pin:
+ if (found)
+ break;
}
- return reg;
+ return ret;
}
-static int o2hb_region_get(const char *region_uuid)
+/*
+ * In local heartbeat mode, region_uuid passed matches the dlm domain name.
+ * In global heartbeat mode, region_uuid passed is NULL.
+ *
+ * In local, we only unpin the matching region. In global we unpin all the
+ * active regions.
+ */
+static void o2hb_region_unpin(const char *region_uuid)
{
- int ret = 0;
struct o2hb_region *reg;
+ char *uuid;
+ int found = 0;
- spin_lock(&o2hb_live_lock);
-
- reg = o2hb_find_region(region_uuid);
- if (!reg)
- ret = -ENOENT;
- spin_unlock(&o2hb_live_lock);
-
- if (ret)
- goto out;
+ assert_spin_locked(&o2hb_live_lock);
- ret = o2nm_depend_this_node();
- if (ret)
- goto out;
+ list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
- ret = o2nm_depend_item(&reg->hr_item);
- if (ret)
- o2nm_undepend_this_node();
+ uuid = config_item_name(&reg->hr_item);
+ if (region_uuid) {
+ if (strcmp(region_uuid, uuid))
+ continue;
+ found = 1;
+ }
-out:
- return ret;
+ if (reg->hr_item_pinned) {
+ mlog(ML_CLUSTER, "Unpin region %s\n", uuid);
+ o2nm_undepend_item(&reg->hr_item);
+ reg->hr_item_pinned = 0;
+ }
+ if (found)
+ break;
+ }
}
-static void o2hb_region_put(const char *region_uuid)
+static int o2hb_region_inc_user(const char *region_uuid)
{
- struct o2hb_region *reg;
+ int ret = 0;
spin_lock(&o2hb_live_lock);
- reg = o2hb_find_region(region_uuid);
+ /* local heartbeat */
+ if (!o2hb_global_heartbeat_active()) {
+ ret = o2hb_region_pin(region_uuid);
+ goto unlock;
+ }
+ /*
+ * if global heartbeat active and this is the first dependent user,
+ * pin all regions if quorum region count <= CUT_OFF
+ */
+ o2hb_dependent_users++;
+ if (o2hb_dependent_users > 1)
+ goto unlock;
+
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
+ ret = o2hb_region_pin(NULL);
+
+unlock:
spin_unlock(&o2hb_live_lock);
+ return ret;
+}
- if (reg) {
- o2nm_undepend_item(&reg->hr_item);
- o2nm_undepend_this_node();
+void o2hb_region_dec_user(const char *region_uuid)
+{
+ spin_lock(&o2hb_live_lock);
+
+ /* local heartbeat */
+ if (!o2hb_global_heartbeat_active()) {
+ o2hb_region_unpin(region_uuid);
+ goto unlock;
}
+
+ /*
+ * if global heartbeat active and there are no dependent users,
+ * unpin all quorum regions
+ */
+ o2hb_dependent_users--;
+ if (!o2hb_dependent_users)
+ o2hb_region_unpin(NULL);
+
+unlock:
+ spin_unlock(&o2hb_live_lock);
}
int o2hb_register_callback(const char *region_uuid,
struct o2hb_callback_func *hc)
{
- struct o2hb_callback_func *tmp;
- struct list_head *iter;
+ struct o2hb_callback_func *f;
struct o2hb_callback *hbcall;
int ret;
@@ -1846,17 +2506,18 @@ int o2hb_register_callback(const char *region_uuid,
}
if (region_uuid) {
- ret = o2hb_region_get(region_uuid);
- if (ret)
+ ret = o2hb_region_inc_user(region_uuid);
+ if (ret) {
+ mlog_errno(ret);
goto out;
+ }
}
down_write(&o2hb_callback_sem);
- list_for_each(iter, &hbcall->list) {
- tmp = list_entry(iter, struct o2hb_callback_func, hc_item);
- if (hc->hc_priority < tmp->hc_priority) {
- list_add_tail(&hc->hc_item, iter);
+ list_for_each_entry(f, &hbcall->list, hc_item) {
+ if (hc->hc_priority < f->hc_priority) {
+ list_add_tail(&hc->hc_item, &f->hc_item);
break;
}
}
@@ -1866,7 +2527,7 @@ int o2hb_register_callback(const char *region_uuid,
up_write(&o2hb_callback_sem);
ret = 0;
out:
- mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
+ mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
ret, __builtin_return_address(0), hc);
return ret;
}
@@ -1877,7 +2538,7 @@ void o2hb_unregister_callback(const char *region_uuid,
{
BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
- mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
+ mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
__builtin_return_address(0), hc);
/* XXX Can this happen _with_ a region reference? */
@@ -1885,7 +2546,7 @@ void o2hb_unregister_callback(const char *region_uuid,
return;
if (region_uuid)
- o2hb_region_put(region_uuid);
+ o2hb_region_dec_user(region_uuid);
down_write(&o2hb_callback_sem);
@@ -1962,3 +2623,37 @@ void o2hb_stop_all_regions(void)
spin_unlock(&o2hb_live_lock);
}
EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
+
+int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
+{
+ struct o2hb_region *reg;
+ int numregs = 0;
+ char *p;
+
+ spin_lock(&o2hb_live_lock);
+
+ p = region_uuids;
+ list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+ if (reg->hr_item_dropped)
+ continue;
+
+ mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
+ if (numregs < max_regions) {
+ memcpy(p, config_item_name(&reg->hr_item),
+ O2HB_MAX_REGION_NAME_LEN);
+ p += O2HB_MAX_REGION_NAME_LEN;
+ }
+ numregs++;
+ }
+
+ spin_unlock(&o2hb_live_lock);
+
+ return numregs;
+}
+EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
+
+int o2hb_global_heartbeat_active(void)
+{
+ return (o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL);
+}
+EXPORT_SYMBOL(o2hb_global_heartbeat_active);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 2f1649253b4..00ad8e8fea5 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -31,6 +31,8 @@
#define O2HB_REGION_TIMEOUT_MS 2000
+#define O2HB_MAX_REGION_NAME_LEN 32
+
/* number of changes to be seen as live */
#define O2HB_LIVE_THRESHOLD 2
/* number of equal samples to be seen as dead */
@@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num);
int o2hb_check_node_heartbeating_from_callback(u8 node_num);
int o2hb_check_local_node_heartbeating(void);
void o2hb_stop_all_regions(void);
+int o2hb_get_all_regions(char *region_uuids, u8 numregions);
+int o2hb_global_heartbeat_active(void);
#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 3bb928a2bf7..07ac24fd925 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -30,7 +30,7 @@
struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
EXPORT_SYMBOL_GPL(mlog_and_bits);
-struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK);
+struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0);
EXPORT_SYMBOL_GPL(mlog_not_bits);
static ssize_t mlog_mask_show(u64 mask, char *buf)
@@ -80,8 +80,6 @@ struct mlog_attribute {
}
static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
- define_mask(ENTRY),
- define_mask(EXIT),
define_mask(TCP),
define_mask(MSG),
define_mask(SOCKET),
@@ -93,26 +91,12 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(DLM_THREAD),
define_mask(DLM_MASTER),
define_mask(DLM_RECOVERY),
- define_mask(AIO),
- define_mask(JOURNAL),
- define_mask(DISK_ALLOC),
- define_mask(SUPER),
- define_mask(FILE_IO),
- define_mask(EXTENT_MAP),
define_mask(DLM_GLUE),
- define_mask(BH_IO),
- define_mask(UPTODATE),
- define_mask(NAMEI),
- define_mask(INODE),
define_mask(VOTE),
- define_mask(DCACHE),
define_mask(CONN),
define_mask(QUORUM),
- define_mask(EXPORT),
- define_mask(XATTR),
- define_mask(QUOTA),
- define_mask(REFCOUNT),
define_mask(BASTS),
+ define_mask(CLUSTER),
define_mask(ERROR),
define_mask(NOTICE),
define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 3dfddbec32f..2260fb9e650 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -81,47 +81,31 @@
#include <linux/sched.h>
/* bits that are frequently given and infrequently matched in the low word */
-/* NOTE: If you add a flag, you need to also update mlog.c! */
-#define ML_ENTRY 0x0000000000000001ULL /* func call entry */
-#define ML_EXIT 0x0000000000000002ULL /* func call exit */
-#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */
-#define ML_MSG 0x0000000000000008ULL /* net network messages */
-#define ML_SOCKET 0x0000000000000010ULL /* net socket lifetime */
-#define ML_HEARTBEAT 0x0000000000000020ULL /* hb all heartbeat tracking */
-#define ML_HB_BIO 0x0000000000000040ULL /* hb io tracing */
-#define ML_DLMFS 0x0000000000000080ULL /* dlm user dlmfs */
-#define ML_DLM 0x0000000000000100ULL /* dlm general debugging */
-#define ML_DLM_DOMAIN 0x0000000000000200ULL /* dlm domain debugging */
-#define ML_DLM_THREAD 0x0000000000000400ULL /* dlm domain thread */
-#define ML_DLM_MASTER 0x0000000000000800ULL /* dlm master functions */
-#define ML_DLM_RECOVERY 0x0000000000001000ULL /* dlm master functions */
-#define ML_AIO 0x0000000000002000ULL /* ocfs2 aio read and write */
-#define ML_JOURNAL 0x0000000000004000ULL /* ocfs2 journalling functions */
-#define ML_DISK_ALLOC 0x0000000000008000ULL /* ocfs2 disk allocation */
-#define ML_SUPER 0x0000000000010000ULL /* ocfs2 mount / umount */
-#define ML_FILE_IO 0x0000000000020000ULL /* ocfs2 file I/O */
-#define ML_EXTENT_MAP 0x0000000000040000ULL /* ocfs2 extent map caching */
-#define ML_DLM_GLUE 0x0000000000080000ULL /* ocfs2 dlm glue layer */
-#define ML_BH_IO 0x0000000000100000ULL /* ocfs2 buffer I/O */
-#define ML_UPTODATE 0x0000000000200000ULL /* ocfs2 caching sequence #'s */
-#define ML_NAMEI 0x0000000000400000ULL /* ocfs2 directory / namespace */
-#define ML_INODE 0x0000000000800000ULL /* ocfs2 inode manipulation */
-#define ML_VOTE 0x0000000001000000ULL /* ocfs2 node messaging */
-#define ML_DCACHE 0x0000000002000000ULL /* ocfs2 dcache operations */
-#define ML_CONN 0x0000000004000000ULL /* net connection management */
-#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
-#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
-#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
-#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
-#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
-#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */
+/* NOTE: If you add a flag, you need to also update masklog.c! */
+#define ML_TCP 0x0000000000000001ULL /* net cluster/tcp.c */
+#define ML_MSG 0x0000000000000002ULL /* net network messages */
+#define ML_SOCKET 0x0000000000000004ULL /* net socket lifetime */
+#define ML_HEARTBEAT 0x0000000000000008ULL /* hb all heartbeat tracking */
+#define ML_HB_BIO 0x0000000000000010ULL /* hb io tracing */
+#define ML_DLMFS 0x0000000000000020ULL /* dlm user dlmfs */
+#define ML_DLM 0x0000000000000040ULL /* dlm general debugging */
+#define ML_DLM_DOMAIN 0x0000000000000080ULL /* dlm domain debugging */
+#define ML_DLM_THREAD 0x0000000000000100ULL /* dlm domain thread */
+#define ML_DLM_MASTER 0x0000000000000200ULL /* dlm master functions */
+#define ML_DLM_RECOVERY 0x0000000000000400ULL /* dlm master functions */
+#define ML_DLM_GLUE 0x0000000000000800ULL /* ocfs2 dlm glue layer */
+#define ML_VOTE 0x0000000000001000ULL /* ocfs2 node messaging */
+#define ML_CONN 0x0000000000002000ULL /* net connection management */
+#define ML_QUORUM 0x0000000000004000ULL /* net connection quorum */
+#define ML_BASTS 0x0000000000008000ULL /* dlmglue asts and basts */
+#define ML_CLUSTER 0x0000000000010000ULL /* cluster stack */
+
/* bits that are infrequently given and frequently matched in the high word */
-#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
-#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
-#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
+#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */
+#define ML_NOTICE 0x2000000000000000ULL /* setn to KERN_NOTICE */
+#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */
#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
-#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
#ifndef MLOG_MASK_PREFIX
#define MLOG_MASK_PREFIX 0
#endif
@@ -215,62 +199,11 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
#define mlog_errno(st) do { \
int _st = (st); \
if (_st != -ERESTARTSYS && _st != -EINTR && \
- _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \
+ _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \
+ _st != -EDQUOT) \
mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
} while (0)
-#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
-#define mlog_entry(fmt, args...) do { \
- mlog(ML_ENTRY, "ENTRY:" fmt , ##args); \
-} while (0)
-
-#define mlog_entry_void() do { \
- mlog(ML_ENTRY, "ENTRY:\n"); \
-} while (0)
-
-/*
- * We disable this for sparse.
- */
-#if !defined(__CHECKER__)
-#define mlog_exit(st) do { \
- if (__builtin_types_compatible_p(typeof(st), unsigned long)) \
- mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st)); \
- else if (__builtin_types_compatible_p(typeof(st), signed long)) \
- mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st)); \
- else if (__builtin_types_compatible_p(typeof(st), unsigned int) \
- || __builtin_types_compatible_p(typeof(st), unsigned short) \
- || __builtin_types_compatible_p(typeof(st), unsigned char)) \
- mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st)); \
- else if (__builtin_types_compatible_p(typeof(st), signed int) \
- || __builtin_types_compatible_p(typeof(st), signed short) \
- || __builtin_types_compatible_p(typeof(st), signed char)) \
- mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st)); \
- else if (__builtin_types_compatible_p(typeof(st), long long)) \
- mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
- else \
- mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st)); \
-} while (0)
-#else
-#define mlog_exit(st) do { \
- mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
-} while (0)
-#endif
-
-#define mlog_exit_ptr(ptr) do { \
- mlog(ML_EXIT, "EXIT: %p\n", ptr); \
-} while (0)
-
-#define mlog_exit_void() do { \
- mlog(ML_EXIT, "EXIT\n"); \
-} while (0)
-#else
-#define mlog_entry(...) do { } while (0)
-#define mlog_entry_void(...) do { } while (0)
-#define mlog_exit(...) do { } while (0)
-#define mlog_exit_ptr(...) do { } while (0)
-#define mlog_exit_void(...) do { } while (0)
-#endif /* defined(CONFIG_OCFS2_DEBUG_MASKLOG) */
-
#define mlog_bug_on_msg(cond, fmt, args...) do { \
if (cond) { \
mlog(ML_ERROR, "bug expression: " #cond "\n"); \
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index a3f150e52b0..73ba81928bc 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -46,10 +46,17 @@
#define O2NET_DEBUG_DIR "o2net"
#define SC_DEBUG_NAME "sock_containers"
#define NST_DEBUG_NAME "send_tracking"
+#define STATS_DEBUG_NAME "stats"
+#define NODES_DEBUG_NAME "connected_nodes"
+
+#define SHOW_SOCK_CONTAINERS 0
+#define SHOW_SOCK_STATS 1
static struct dentry *o2net_dentry;
static struct dentry *sc_dentry;
static struct dentry *nst_dentry;
+static struct dentry *stats_dentry;
+static struct dentry *nodes_dentry;
static DEFINE_SPINLOCK(o2net_debug_lock);
@@ -123,37 +130,42 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static int nst_seq_show(struct seq_file *seq, void *v)
{
struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+ ktime_t now;
+ s64 sock, send, status;
spin_lock(&o2net_debug_lock);
nst = next_nst(dummy_nst);
+ if (!nst)
+ goto out;
- if (nst != NULL) {
- /* get_task_comm isn't exported. oh well. */
- seq_printf(seq, "%p:\n"
- " pid: %lu\n"
- " tgid: %lu\n"
- " process name: %s\n"
- " node: %u\n"
- " sc: %p\n"
- " message id: %d\n"
- " message type: %u\n"
- " message key: 0x%08x\n"
- " sock acquiry: %lu.%ld\n"
- " send start: %lu.%ld\n"
- " wait start: %lu.%ld\n",
- nst, (unsigned long)nst->st_task->pid,
- (unsigned long)nst->st_task->tgid,
- nst->st_task->comm, nst->st_node,
- nst->st_sc, nst->st_id, nst->st_msg_type,
- nst->st_msg_key,
- nst->st_sock_time.tv_sec,
- (long)nst->st_sock_time.tv_usec,
- nst->st_send_time.tv_sec,
- (long)nst->st_send_time.tv_usec,
- nst->st_status_time.tv_sec,
- (long)nst->st_status_time.tv_usec);
- }
+ now = ktime_get();
+ sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
+ send = ktime_to_us(ktime_sub(now, nst->st_send_time));
+ status = ktime_to_us(ktime_sub(now, nst->st_status_time));
+
+ /* get_task_comm isn't exported. oh well. */
+ seq_printf(seq, "%p:\n"
+ " pid: %lu\n"
+ " tgid: %lu\n"
+ " process name: %s\n"
+ " node: %u\n"
+ " sc: %p\n"
+ " message id: %d\n"
+ " message type: %u\n"
+ " message key: 0x%08x\n"
+ " sock acquiry: %lld usecs ago\n"
+ " send start: %lld usecs ago\n"
+ " wait start: %lld usecs ago\n",
+ nst, (unsigned long)task_pid_nr(nst->st_task),
+ (unsigned long)nst->st_task->tgid,
+ nst->st_task->comm, nst->st_node,
+ nst->st_sc, nst->st_id, nst->st_msg_type,
+ nst->st_msg_key,
+ (long long)sock,
+ (long long)send,
+ (long long)status);
+out:
spin_unlock(&o2net_debug_lock);
return 0;
@@ -228,6 +240,11 @@ void o2net_debug_del_sc(struct o2net_sock_container *sc)
spin_unlock(&o2net_debug_lock);
}
+struct o2net_sock_debug {
+ int dbg_ctxt;
+ struct o2net_sock_container *dbg_sock;
+};
+
static struct o2net_sock_container
*next_sc(struct o2net_sock_container *sc_start)
{
@@ -253,7 +270,8 @@ static struct o2net_sock_container
static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct o2net_sock_container *sc, *dummy_sc = seq->private;
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
spin_lock(&o2net_debug_lock);
sc = next_sc(dummy_sc);
@@ -264,7 +282,8 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct o2net_sock_container *sc, *dummy_sc = seq->private;
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
spin_lock(&o2net_debug_lock);
sc = next_sc(dummy_sc);
@@ -276,65 +295,107 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return sc; /* unused, just needs to be null when done */
}
-#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec
+#ifdef CONFIG_OCFS2_FS_STATS
+# define sc_send_count(_s) ((_s)->sc_send_count)
+# define sc_recv_count(_s) ((_s)->sc_recv_count)
+# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total))
+# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total))
+# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total))
+# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total))
+#else
+# define sc_send_count(_s) (0U)
+# define sc_recv_count(_s) (0U)
+# define sc_tv_acquiry_total_ns(_s) (0LL)
+# define sc_tv_send_total_ns(_s) (0LL)
+# define sc_tv_status_total_ns(_s) (0LL)
+# define sc_tv_process_total_ns(_s) (0LL)
+#endif
+
+/* So that debugfs.ocfs2 can determine which format is being used */
+#define O2NET_STATS_STR_VERSION 1
+static void sc_show_sock_stats(struct seq_file *seq,
+ struct o2net_sock_container *sc)
+{
+ if (!sc)
+ return;
+
+ seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
+ sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
+ (long long)sc_tv_acquiry_total_ns(sc),
+ (long long)sc_tv_send_total_ns(sc),
+ (long long)sc_tv_status_total_ns(sc),
+ (unsigned long)sc_recv_count(sc),
+ (long long)sc_tv_process_total_ns(sc));
+}
+
+static void sc_show_sock_container(struct seq_file *seq,
+ struct o2net_sock_container *sc)
+{
+ struct inet_sock *inet = NULL;
+ __be32 saddr = 0, daddr = 0;
+ __be16 sport = 0, dport = 0;
+
+ if (!sc)
+ return;
+
+ if (sc->sc_sock) {
+ inet = inet_sk(sc->sc_sock->sk);
+ /* the stack's structs aren't sparse endian clean */
+ saddr = (__force __be32)inet->inet_saddr;
+ daddr = (__force __be32)inet->inet_daddr;
+ sport = (__force __be16)inet->inet_sport;
+ dport = (__force __be16)inet->inet_dport;
+ }
+
+ /* XXX sigh, inet-> doesn't have sparse annotation so any
+ * use of it here generates a warning with -Wbitwise */
+ seq_printf(seq, "%p:\n"
+ " krefs: %d\n"
+ " sock: %pI4:%u -> "
+ "%pI4:%u\n"
+ " remote node: %s\n"
+ " page off: %zu\n"
+ " handshake ok: %u\n"
+ " timer: %lld usecs\n"
+ " data ready: %lld usecs\n"
+ " advance start: %lld usecs\n"
+ " advance stop: %lld usecs\n"
+ " func start: %lld usecs\n"
+ " func stop: %lld usecs\n"
+ " func key: 0x%08x\n"
+ " func type: %u\n",
+ sc,
+ atomic_read(&sc->sc_kref.refcount),
+ &saddr, inet ? ntohs(sport) : 0,
+ &daddr, inet ? ntohs(dport) : 0,
+ sc->sc_node->nd_name,
+ sc->sc_page_off,
+ sc->sc_handshake_ok,
+ (long long)ktime_to_us(sc->sc_tv_timer),
+ (long long)ktime_to_us(sc->sc_tv_data_ready),
+ (long long)ktime_to_us(sc->sc_tv_advance_start),
+ (long long)ktime_to_us(sc->sc_tv_advance_stop),
+ (long long)ktime_to_us(sc->sc_tv_func_start),
+ (long long)ktime_to_us(sc->sc_tv_func_stop),
+ sc->sc_msg_key,
+ sc->sc_msg_type);
+}
static int sc_seq_show(struct seq_file *seq, void *v)
{
- struct o2net_sock_container *sc, *dummy_sc = seq->private;
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
spin_lock(&o2net_debug_lock);
sc = next_sc(dummy_sc);
- if (sc != NULL) {
- struct inet_sock *inet = NULL;
-
- __be32 saddr = 0, daddr = 0;
- __be16 sport = 0, dport = 0;
-
- if (sc->sc_sock) {
- inet = inet_sk(sc->sc_sock->sk);
- /* the stack's structs aren't sparse endian clean */
- saddr = (__force __be32)inet->inet_saddr;
- daddr = (__force __be32)inet->inet_daddr;
- sport = (__force __be16)inet->inet_sport;
- dport = (__force __be16)inet->inet_dport;
- }
-
- /* XXX sigh, inet-> doesn't have sparse annotation so any
- * use of it here generates a warning with -Wbitwise */
- seq_printf(seq, "%p:\n"
- " krefs: %d\n"
- " sock: %pI4:%u -> "
- "%pI4:%u\n"
- " remote node: %s\n"
- " page off: %zu\n"
- " handshake ok: %u\n"
- " timer: %lu.%ld\n"
- " data ready: %lu.%ld\n"
- " advance start: %lu.%ld\n"
- " advance stop: %lu.%ld\n"
- " func start: %lu.%ld\n"
- " func stop: %lu.%ld\n"
- " func key: %u\n"
- " func type: %u\n",
- sc,
- atomic_read(&sc->sc_kref.refcount),
- &saddr, inet ? ntohs(sport) : 0,
- &daddr, inet ? ntohs(dport) : 0,
- sc->sc_node->nd_name,
- sc->sc_page_off,
- sc->sc_handshake_ok,
- TV_SEC_USEC(sc->sc_tv_timer),
- TV_SEC_USEC(sc->sc_tv_data_ready),
- TV_SEC_USEC(sc->sc_tv_advance_start),
- TV_SEC_USEC(sc->sc_tv_advance_stop),
- TV_SEC_USEC(sc->sc_tv_func_start),
- TV_SEC_USEC(sc->sc_tv_func_stop),
- sc->sc_msg_key,
- sc->sc_msg_type);
+ if (sc) {
+ if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
+ sc_show_sock_container(seq, sc);
+ else
+ sc_show_sock_stats(seq, sc);
}
-
spin_unlock(&o2net_debug_lock);
return 0;
@@ -351,7 +412,7 @@ static const struct seq_operations sc_seq_ops = {
.show = sc_seq_show,
};
-static int sc_fop_open(struct inode *inode, struct file *file)
+static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
{
struct o2net_sock_container *dummy_sc;
struct seq_file *seq;
@@ -369,7 +430,8 @@ static int sc_fop_open(struct inode *inode, struct file *file)
goto out;
seq = file->private_data;
- seq->private = dummy_sc;
+ seq->private = sd;
+ sd->dbg_sock = dummy_sc;
o2net_debug_add_sc(dummy_sc);
dummy_sc = NULL;
@@ -382,12 +444,48 @@ out:
static int sc_fop_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
- struct o2net_sock_container *dummy_sc = seq->private;
+ struct o2net_sock_debug *sd = seq->private;
+ struct o2net_sock_container *dummy_sc = sd->dbg_sock;
o2net_debug_del_sc(dummy_sc);
return seq_release_private(inode, file);
}
+static int stats_fop_open(struct inode *inode, struct file *file)
+{
+ struct o2net_sock_debug *sd;
+
+ sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
+ if (sd == NULL)
+ return -ENOMEM;
+
+ sd->dbg_ctxt = SHOW_SOCK_STATS;
+ sd->dbg_sock = NULL;
+
+ return sc_common_open(file, sd);
+}
+
+static const struct file_operations stats_seq_fops = {
+ .open = stats_fop_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = sc_fop_release,
+};
+
+static int sc_fop_open(struct inode *inode, struct file *file)
+{
+ struct o2net_sock_debug *sd;
+
+ sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
+ if (sd == NULL)
+ return -ENOMEM;
+
+ sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
+ sd->dbg_sock = NULL;
+
+ return sc_common_open(file, sd);
+}
+
static const struct file_operations sc_seq_fops = {
.open = sc_fop_open,
.read = seq_read,
@@ -395,49 +493,87 @@ static const struct file_operations sc_seq_fops = {
.release = sc_fop_release,
};
-int o2net_debugfs_init(void)
+static int o2net_fill_bitmap(char *buf, int len)
{
- o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
- if (!o2net_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ int i = -1, out = 0;
- nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR,
- o2net_dentry, NULL,
- &nst_seq_fops);
- if (!nst_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ o2net_fill_node_map(map, sizeof(map));
- sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR,
- o2net_dentry, NULL,
- &sc_seq_fops);
- if (!sc_dentry) {
- mlog_errno(-ENOMEM);
- goto bail;
- }
+ while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+ out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+ out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+
+ return out;
+}
+
+static int nodes_fop_open(struct inode *inode, struct file *file)
+{
+ char *buf;
+
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
+
+ file->private_data = buf;
return 0;
-bail:
- if (sc_dentry)
- debugfs_remove(sc_dentry);
- if (nst_dentry)
- debugfs_remove(nst_dentry);
- if (o2net_dentry)
- debugfs_remove(o2net_dentry);
- return -ENOMEM;
}
+static int o2net_debug_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static ssize_t o2net_debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+ i_size_read(file->f_mapping->host));
+}
+
+static const struct file_operations nodes_fops = {
+ .open = nodes_fop_open,
+ .release = o2net_debug_release,
+ .read = o2net_debug_read,
+ .llseek = generic_file_llseek,
+};
+
void o2net_debugfs_exit(void)
{
- if (sc_dentry)
- debugfs_remove(sc_dentry);
- if (nst_dentry)
- debugfs_remove(nst_dentry);
+ debugfs_remove(nodes_dentry);
+ debugfs_remove(stats_dentry);
+ debugfs_remove(sc_dentry);
+ debugfs_remove(nst_dentry);
+ debugfs_remove(o2net_dentry);
+}
+
+int o2net_debugfs_init(void)
+{
+ umode_t mode = S_IFREG|S_IRUSR;
+
+ o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
if (o2net_dentry)
- debugfs_remove(o2net_dentry);
+ nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &nst_seq_fops);
+ if (nst_dentry)
+ sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &sc_seq_fops);
+ if (sc_dentry)
+ stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &stats_seq_fops);
+ if (stats_dentry)
+ nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
+ o2net_dentry, NULL, &nodes_fops);
+ if (nodes_dentry)
+ return 0;
+
+ o2net_debugfs_exit();
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index c81142e3ef8..441c84e169e 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/configfs.h>
@@ -28,7 +29,6 @@
#include "heartbeat.h"
#include "masklog.h"
#include "sys.h"
-#include "ver.h"
/* for now we operate under the assertion that there can be only one
* cluster active at a time. Changing this will require trickling
@@ -710,6 +710,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group,
config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
spin_lock_init(&node->nd_lock);
+ mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
+
return &node->nd_item;
}
@@ -743,6 +745,9 @@ static void o2nm_node_group_drop_item(struct config_group *group,
}
write_unlock(&cluster->cl_nodes_lock);
+ mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
+ config_item_name(&node->nd_item));
+
config_item_put(item);
}
@@ -939,8 +944,6 @@ static int __init init_o2nm(void)
{
int ret = -1;
- cluster_print_version();
-
ret = o2hb_init();
if (ret)
goto out;
@@ -978,6 +981,7 @@ out:
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster management");
module_init(init_o2nm)
module_exit(exit_o2nm)
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 5b9854bad57..49b594325be 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -36,4 +36,10 @@
/* host name, group name, cluster name all 64 bytes */
#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
+/*
+ * Maximum number of global heartbeat regions allowed.
+ * **CAUTION** Changing this number will break dlm compatibility.
+ */
+#define O2NM_MAX_REGIONS 32
+
#endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 639024033fc..1ec141e758d 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
* and if they're the last, they fire off the decision.
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/reboot.h>
@@ -90,7 +89,7 @@ static void o2quo_fence_self(void)
};
}
-/* Indicate that a timeout occured on a hearbeat region write. The
+/* Indicate that a timeout occurred on a hearbeat region write. The
* other nodes in the cluster may consider us dead at that time so we
* want to "fence" ourselves so that we don't scribble on the disk
* after they think they've recovered us. This can't solve all
@@ -262,10 +261,10 @@ void o2quo_hb_still_up(u8 node)
spin_unlock(&qs->qs_lock);
}
-/* This is analagous to hb_up. as a node's connection comes up we delay the
+/* This is analogous to hb_up. as a node's connection comes up we delay the
* quorum decision until we see it heartbeating. the hold will be droped in
* hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
- * it's already heartbeating we we might be dropping a hold that conn_up got.
+ * it's already heartbeating we might be dropping a hold that conn_up got.
* */
void o2quo_conn_up(u8 node)
{
@@ -326,5 +325,7 @@ void o2quo_init(void)
void o2quo_exit(void)
{
- flush_scheduled_work();
+ struct o2quo_state *qs = &o2quo_state;
+
+ flush_work(&qs->qs_work);
}
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index bc702dab5d1..b7f57271d49 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -41,7 +41,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
}
static struct kobj_attribute attr_version =
- __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
+ __ATTR(interface_revision, S_IRUGO, version_show, NULL);
static struct attribute *o2cb_attrs[] = {
&attr_version.attr,
@@ -57,7 +57,6 @@ static struct kset *o2cb_kset;
void o2cb_sys_shutdown(void)
{
mlog_sys_shutdown();
- sysfs_remove_link(NULL, "o2cb");
kset_unregister(o2cb_kset);
}
@@ -69,14 +68,6 @@ int o2cb_sys_init(void)
if (!o2cb_kset)
return -ENOMEM;
- /*
- * Create this symlink for backwards compatibility with old
- * versions of ocfs2-tools which look for things in /sys/o2cb.
- */
- ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
- if (ret)
- goto error;
-
ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
if (ret)
goto error;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 73e743eea2c..681691bc233 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -59,6 +59,7 @@
#include <linux/idr.h>
#include <linux/kref.h>
#include <linux/net.h>
+#include <linux/export.h>
#include <net/tcp.h>
#include <asm/uaccess.h>
@@ -107,7 +108,7 @@ static struct rb_root o2net_handler_tree = RB_ROOT;
static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
/* XXX someday we'll need better accounting */
-static struct socket *o2net_listen_sock = NULL;
+static struct socket *o2net_listen_sock;
/*
* listen work is only queued by the listening socket callbacks on the
@@ -136,7 +137,7 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] =
static void o2net_sc_connect_completed(struct work_struct *work);
static void o2net_rx_until_empty(struct work_struct *work);
static void o2net_shutdown_sc(struct work_struct *work);
-static void o2net_listen_data_ready(struct sock *sk, int bytes);
+static void o2net_listen_data_ready(struct sock *sk);
static void o2net_sc_send_keep_req(struct work_struct *work);
static void o2net_idle_timer(unsigned long data);
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
@@ -153,74 +154,125 @@ static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
nst->st_node = node;
}
-static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
{
- do_gettimeofday(&nst->st_sock_time);
+ nst->st_sock_time = ktime_get();
}
-static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
{
- do_gettimeofday(&nst->st_send_time);
+ nst->st_send_time = ktime_get();
}
-static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
{
- do_gettimeofday(&nst->st_status_time);
+ nst->st_status_time = ktime_get();
}
-static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
- struct o2net_sock_container *sc)
+static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+ struct o2net_sock_container *sc)
{
nst->st_sc = sc;
}
-static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+ u32 msg_id)
{
nst->st_id = msg_id;
}
-#else /* CONFIG_DEBUG_FS */
-
-static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
- u32 msgkey, struct task_struct *task, u8 node)
+static inline void o2net_set_sock_timer(struct o2net_sock_container *sc)
{
+ sc->sc_tv_timer = ktime_get();
}
-static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_data_ready_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_data_ready = ktime_get();
}
-static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_advance_start_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_advance_start = ktime_get();
}
-static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+static inline void o2net_set_advance_stop_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_advance_stop = ktime_get();
}
-static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
- struct o2net_sock_container *sc)
+static inline void o2net_set_func_start_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_func_start = ktime_get();
}
-static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
- u32 msg_id)
+static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
{
+ sc->sc_tv_func_stop = ktime_get();
}
+#else /* CONFIG_DEBUG_FS */
+# define o2net_init_nst(a, b, c, d, e)
+# define o2net_set_nst_sock_time(a)
+# define o2net_set_nst_send_time(a)
+# define o2net_set_nst_status_time(a)
+# define o2net_set_nst_sock_container(a, b)
+# define o2net_set_nst_msg_id(a, b)
+# define o2net_set_sock_timer(a)
+# define o2net_set_data_ready_time(a)
+# define o2net_set_advance_start_time(a)
+# define o2net_set_advance_stop_time(a)
+# define o2net_set_func_start_time(a)
+# define o2net_set_func_stop_time(a)
#endif /* CONFIG_DEBUG_FS */
-static inline int o2net_reconnect_delay(void)
+#ifdef CONFIG_OCFS2_FS_STATS
+static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
+{
+ return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
+}
+
+static void o2net_update_send_stats(struct o2net_send_tracking *nst,
+ struct o2net_sock_container *sc)
+{
+ sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,
+ ktime_sub(ktime_get(),
+ nst->st_status_time));
+ sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,
+ ktime_sub(nst->st_status_time,
+ nst->st_send_time));
+ sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,
+ ktime_sub(nst->st_send_time,
+ nst->st_sock_time));
+ sc->sc_send_count++;
+}
+
+static void o2net_update_recv_stats(struct o2net_sock_container *sc)
+{
+ sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
+ o2net_get_func_run_time(sc));
+ sc->sc_recv_count++;
+}
+
+#else
+
+# define o2net_update_send_stats(a, b)
+
+# define o2net_update_recv_stats(sc)
+
+#endif /* CONFIG_OCFS2_FS_STATS */
+
+static inline unsigned int o2net_reconnect_delay(void)
{
return o2nm_single_cluster->cl_reconnect_delay_ms;
}
-static inline int o2net_keepalive_delay(void)
+static inline unsigned int o2net_keepalive_delay(void)
{
return o2nm_single_cluster->cl_keepalive_delay_ms;
}
-static inline int o2net_idle_timeout(void)
+static inline unsigned int o2net_idle_timeout(void)
{
return o2nm_single_cluster->cl_idle_timeout_ms;
}
@@ -252,28 +304,22 @@ static u8 o2net_num_from_nn(struct o2net_node *nn)
static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)
{
- int ret = 0;
-
- do {
- if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) {
- ret = -EAGAIN;
- break;
- }
- spin_lock(&nn->nn_lock);
- ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id);
- if (ret == 0)
- list_add_tail(&nsw->ns_node_item,
- &nn->nn_status_list);
- spin_unlock(&nn->nn_lock);
- } while (ret == -EAGAIN);
+ int ret;
- if (ret == 0) {
- init_waitqueue_head(&nsw->ns_wq);
- nsw->ns_sys_status = O2NET_ERR_NONE;
- nsw->ns_status = 0;
+ spin_lock(&nn->nn_lock);
+ ret = idr_alloc(&nn->nn_status_idr, nsw, 0, 0, GFP_ATOMIC);
+ if (ret >= 0) {
+ nsw->ns_id = ret;
+ list_add_tail(&nsw->ns_node_item, &nn->nn_status_list);
}
+ spin_unlock(&nn->nn_lock);
+ if (ret < 0)
+ return ret;
- return ret;
+ init_waitqueue_head(&nsw->ns_wq);
+ nsw->ns_sys_status = O2NET_ERR_NONE;
+ nsw->ns_status = 0;
+ return 0;
}
static void o2net_complete_nsw_locked(struct o2net_node *nn,
@@ -355,10 +401,14 @@ static void sc_kref_release(struct kref *kref)
sc->sc_sock = NULL;
}
+ o2nm_undepend_item(&sc->sc_node->nd_item);
o2nm_node_put(sc->sc_node);
sc->sc_node = NULL;
o2net_debug_del_sc(sc);
+
+ if (sc->sc_page)
+ __free_page(sc->sc_page);
kfree(sc);
}
@@ -376,6 +426,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
{
struct o2net_sock_container *sc, *ret = NULL;
struct page *page = NULL;
+ int status = 0;
page = alloc_page(GFP_NOFS);
sc = kzalloc(sizeof(*sc), GFP_NOFS);
@@ -386,6 +437,13 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
o2nm_node_get(node);
sc->sc_node = node;
+ /* pin the node item of the remote node */
+ status = o2nm_depend_item(&node->nd_item);
+ if (status) {
+ mlog_errno(status);
+ o2nm_node_put(node);
+ goto out;
+ }
INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed);
INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty);
INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
@@ -485,8 +543,9 @@ static void o2net_set_nn_state(struct o2net_node *nn,
}
if (was_valid && !valid) {
- printk(KERN_NOTICE "o2net: no longer connected to "
- SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
+ if (old_sc)
+ printk(KERN_NOTICE "o2net: No longer connected to "
+ SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
o2net_complete_nodes_nsw(nn);
}
@@ -495,7 +554,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
cancel_delayed_work(&nn->nn_connect_expired);
printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
o2nm_this_node() > sc->sc_node->nd_num ?
- "connected to" : "accepted connection from",
+ "Connected to" : "Accepted connection from",
SC_NODEF_ARGS(sc));
}
@@ -505,7 +564,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
* the work queue actually being up. */
if (!valid && o2net_wq) {
unsigned long delay;
- /* delay if we're withing a RECONNECT_DELAY of the
+ /* delay if we're within a RECONNECT_DELAY of the
* last attempt */
delay = (nn->nn_last_connect_attempt +
msecs_to_jiffies(o2net_reconnect_delay()))
@@ -538,15 +597,15 @@ static void o2net_set_nn_state(struct o2net_node *nn,
}
/* see o2net_register_callbacks() */
-static void o2net_data_ready(struct sock *sk, int bytes)
+static void o2net_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
read_lock(&sk->sk_callback_lock);
if (sk->sk_user_data) {
struct o2net_sock_container *sc = sk->sk_user_data;
sclog(sc, "data_ready hit\n");
- do_gettimeofday(&sc->sc_tv_data_ready);
+ o2net_set_data_ready_time(sc);
o2net_sc_queue_work(sc, &sc->sc_rx_work);
ready = sc->sc_data_ready;
} else {
@@ -554,7 +613,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
}
read_unlock(&sk->sk_callback_lock);
- ready(sk, bytes);
+ ready(sk);
}
/* see o2net_register_callbacks() */
@@ -575,16 +634,19 @@ static void o2net_state_change(struct sock *sk)
state_change = sc->sc_state_change;
switch(sk->sk_state) {
- /* ignore connecting sockets as they make progress */
- case TCP_SYN_SENT:
- case TCP_SYN_RECV:
- break;
- case TCP_ESTABLISHED:
- o2net_sc_queue_work(sc, &sc->sc_connect_work);
- break;
- default:
- o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
- break;
+ /* ignore connecting sockets as they make progress */
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ break;
+ case TCP_ESTABLISHED:
+ o2net_sc_queue_work(sc, &sc->sc_connect_work);
+ break;
+ default:
+ printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
+ " shutdown, state %d\n",
+ SC_NODEF_ARGS(sc), sk->sk_state);
+ o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+ break;
}
out:
read_unlock(&sk->sk_callback_lock);
@@ -704,32 +766,32 @@ static struct o2net_msg_handler *
o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
struct rb_node **ret_parent)
{
- struct rb_node **p = &o2net_handler_tree.rb_node;
- struct rb_node *parent = NULL;
+ struct rb_node **p = &o2net_handler_tree.rb_node;
+ struct rb_node *parent = NULL;
struct o2net_msg_handler *nmh, *ret = NULL;
int cmp;
- while (*p) {
- parent = *p;
- nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
+ while (*p) {
+ parent = *p;
+ nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
cmp = o2net_handler_cmp(nmh, msg_type, key);
- if (cmp < 0)
- p = &(*p)->rb_left;
- else if (cmp > 0)
- p = &(*p)->rb_right;
- else {
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else if (cmp > 0)
+ p = &(*p)->rb_right;
+ else {
ret = nmh;
- break;
+ break;
}
- }
+ }
- if (ret_p != NULL)
- *ret_p = p;
- if (ret_parent != NULL)
- *ret_parent = parent;
+ if (ret_p != NULL)
+ *ret_p = p;
+ if (ret_parent != NULL)
+ *ret_parent = parent;
- return ret;
+ return ret;
}
static void o2net_handler_kref_release(struct kref *kref)
@@ -806,7 +868,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
/* we've had some trouble with handlers seemingly vanishing. */
mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p,
&parent) == NULL,
- "couldn't find handler we *just* registerd "
+ "couldn't find handler we *just* registered "
"for type %u key %08x\n", msg_type, key);
}
write_unlock(&o2net_handler_lock);
@@ -854,57 +916,30 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
{
- int ret;
- mm_segment_t oldfs;
- struct kvec vec = {
- .iov_len = len,
- .iov_base = data,
- };
- struct msghdr msg = {
- .msg_iovlen = 1,
- .msg_iov = (struct iovec *)&vec,
- .msg_flags = MSG_DONTWAIT,
- };
-
- oldfs = get_fs();
- set_fs(get_ds());
- ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
- set_fs(oldfs);
-
- return ret;
+ struct kvec vec = { .iov_len = len, .iov_base = data, };
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
+ return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags);
}
static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
size_t veclen, size_t total)
{
int ret;
- mm_segment_t oldfs;
- struct msghdr msg = {
- .msg_iov = (struct iovec *)vec,
- .msg_iovlen = veclen,
- };
+ struct msghdr msg;
if (sock == NULL) {
ret = -EINVAL;
goto out;
}
- oldfs = get_fs();
- set_fs(get_ds());
- ret = sock_sendmsg(sock, &msg, total);
- set_fs(oldfs);
- if (ret != total) {
- mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
- total);
- if (ret >= 0)
- ret = -EPIPE; /* should be smarter, I bet */
- goto out;
- }
-
- ret = 0;
+ ret = kernel_sendmsg(sock, &msg, vec, veclen, total);
+ if (likely(ret == total))
+ return 0;
+ mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total);
+ if (ret >= 0)
+ ret = -EPIPE; /* should be smarter, I bet */
out:
- if (ret < 0)
- mlog(0, "returning error: %d\n", ret);
+ mlog(0, "returning error: %d\n", ret);
return ret;
}
@@ -971,10 +1006,29 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
return ret;
}
+/* Get a map of all nodes to which this node is currently connected to */
+void o2net_fill_node_map(unsigned long *map, unsigned bytes)
+{
+ struct o2net_sock_container *sc;
+ int node, ret;
+
+ BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
+
+ memset(map, 0, bytes);
+ for (node = 0; node < O2NM_MAX_NODES; ++node) {
+ o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
+ if (!ret) {
+ set_bit(node, map);
+ sc_put(sc);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(o2net_fill_node_map);
+
int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
size_t caller_veclen, u8 target_node, int *status)
{
- int ret;
+ int ret = 0;
struct o2net_msg *msg = NULL;
size_t veclen, caller_bytes = 0;
struct kvec *vec = NULL;
@@ -1067,6 +1121,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
o2net_set_nst_status_time(&nst);
wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
+ o2net_update_send_stats(&nst, sc);
+
/* Note that we avoid overwriting the callers status return
* variable if a system error was reported on the other
* side. Callers beware. */
@@ -1080,10 +1136,8 @@ out:
o2net_debug_del_nst(&nst); /* must be before dropping sc and node */
if (sc)
sc_put(sc);
- if (vec)
- kfree(vec);
- if (msg)
- kfree(msg);
+ kfree(vec);
+ kfree(msg);
o2net_complete_nsw(nn, &nsw, 0, 0, 0);
return ret;
}
@@ -1180,13 +1234,15 @@ static int o2net_process_message(struct o2net_sock_container *sc,
if (syserr != O2NET_ERR_NONE)
goto out_respond;
- do_gettimeofday(&sc->sc_tv_func_start);
+ o2net_set_func_start_time(sc);
sc->sc_msg_key = be32_to_cpu(hdr->key);
sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
be16_to_cpu(hdr->data_len),
nmh->nh_func_data, &ret_data);
- do_gettimeofday(&sc->sc_tv_func_stop);
+ o2net_set_func_stop_time(sc);
+
+ o2net_update_recv_stats(sc);
out_respond:
/* this destroys the hdr, so don't use it after this */
@@ -1217,11 +1273,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
- mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
- "version %llu but %llu is required, disconnecting\n",
- SC_NODEF_ARGS(sc),
- (unsigned long long)be64_to_cpu(hand->protocol_version),
- O2NET_PROTOCOL_VERSION);
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net "
+ "protocol version %llu but %llu is required. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ (unsigned long long)be64_to_cpu(hand->protocol_version),
+ O2NET_PROTOCOL_VERSION);
/* don't bother reconnecting if its the wrong version. */
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
@@ -1235,33 +1291,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
*/
if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
o2net_idle_timeout()) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2net_idle_timeout_ms),
- o2net_idle_timeout());
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network "
+ "idle timeout of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2net_idle_timeout_ms),
+ o2net_idle_timeout());
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
o2net_keepalive_delay()) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2net_keepalive_delay_ms),
- o2net_keepalive_delay());
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive "
+ "delay of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2net_keepalive_delay_ms),
+ o2net_keepalive_delay());
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
O2HB_MAX_WRITE_TIMEOUT_MS) {
- mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of "
- "%u ms, but we use %u ms locally. disconnecting\n",
- SC_NODEF_ARGS(sc),
- be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
- O2HB_MAX_WRITE_TIMEOUT_MS);
+ printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat "
+ "timeout of %u ms, but we use %u ms locally. "
+ "Disconnecting.\n", SC_NODEF_ARGS(sc),
+ be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
+ O2HB_MAX_WRITE_TIMEOUT_MS);
o2net_ensure_shutdown(nn, sc, -ENOTCONN);
return -1;
}
@@ -1297,7 +1353,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
size_t datalen;
sclog(sc, "receiving\n");
- do_gettimeofday(&sc->sc_tv_advance_start);
+ o2net_set_advance_start_time(sc);
if (unlikely(sc->sc_handshake_ok == 0)) {
if(sc->sc_page_off < sizeof(struct o2net_handshake)) {
@@ -1372,7 +1428,7 @@ static int o2net_advance_rx(struct o2net_sock_container *sc)
out:
sclog(sc, "ret = %d\n", ret);
- do_gettimeofday(&sc->sc_tv_advance_stop);
+ o2net_set_advance_stop_time(sc);
return ret;
}
@@ -1472,27 +1528,16 @@ static void o2net_idle_timer(unsigned long data)
{
struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
- struct timeval now;
-
- do_gettimeofday(&now);
+#ifdef CONFIG_DEBUG_FS
+ unsigned long msecs = ktime_to_ms(ktime_get()) -
+ ktime_to_ms(sc->sc_tv_timer);
+#else
+ unsigned long msecs = o2net_idle_timeout();
+#endif
- printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
- "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
- o2net_idle_timeout() / 1000,
- o2net_idle_timeout() % 1000);
- mlog(ML_NOTICE, "here are some times that might help debug the "
- "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
- "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
- sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
- now.tv_sec, (long) now.tv_usec,
- sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
- sc->sc_tv_advance_start.tv_sec,
- (long) sc->sc_tv_advance_start.tv_usec,
- sc->sc_tv_advance_stop.tv_sec,
- (long) sc->sc_tv_advance_stop.tv_usec,
- sc->sc_msg_key, sc->sc_msg_type,
- sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
- sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
+ printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
+ "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
+ msecs / 1000, msecs % 1000);
/*
* Initialize the nn_timeout so that the next connection attempt
@@ -1508,7 +1553,7 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
msecs_to_jiffies(o2net_keepalive_delay()));
- do_gettimeofday(&sc->sc_tv_timer);
+ o2net_set_sock_timer(sc);
mod_timer(&sc->sc_idle_timeout,
jiffies + msecs_to_jiffies(o2net_idle_timeout()));
}
@@ -1624,13 +1669,12 @@ static void o2net_start_connect(struct work_struct *work)
ret = 0;
out:
- if (ret) {
- mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed "
- "with errno %d\n", SC_NODEF_ARGS(sc), ret);
+ if (ret && sc) {
+ printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
+ " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
/* 0 err so that another will be queued and attempted
* from set_nn_state */
- if (sc)
- o2net_ensure_shutdown(nn, sc, 0);
+ o2net_ensure_shutdown(nn, sc, 0);
}
if (sc)
sc_put(sc);
@@ -1649,8 +1693,8 @@ static void o2net_connect_expired(struct work_struct *work)
spin_lock(&nn->nn_lock);
if (!nn->nn_sc_valid) {
- mlog(ML_ERROR, "no connection established with node %u after "
- "%u.%u seconds, giving up and returning errors.\n",
+ printk(KERN_NOTICE "o2net: No connection established with "
+ "node %u after %u.%u seconds, giving up.\n",
o2net_num_from_nn(nn),
o2net_idle_timeout() / 1000,
o2net_idle_timeout() % 1000);
@@ -1693,6 +1737,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
{
o2quo_hb_down(node_num);
+ if (!node)
+ return;
+
if (node_num != o2nm_this_node())
o2net_disconnect_node(node);
@@ -1706,6 +1753,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
o2quo_hb_up(node_num);
+ BUG_ON(!node);
+
/* ensure an immediate connect attempt */
nn->nn_last_connect_attempt = jiffies -
(msecs_to_jiffies(o2net_reconnect_delay()) + 1);
@@ -1750,16 +1799,18 @@ int o2net_register_hb_callbacks(void)
/* ------------------------------------------------------------ */
-static int o2net_accept_one(struct socket *sock)
+static int o2net_accept_one(struct socket *sock, int *more)
{
int ret, slen;
struct sockaddr_in sin;
struct socket *new_sock = NULL;
struct o2nm_node *node = NULL;
+ struct o2nm_node *local_node = NULL;
struct o2net_sock_container *sc = NULL;
struct o2net_node *nn;
BUG_ON(sock == NULL);
+ *more = 0;
ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
sock->sk->sk_protocol, &new_sock);
if (ret)
@@ -1771,6 +1822,7 @@ static int o2net_accept_one(struct socket *sock)
if (ret < 0)
goto out;
+ *more = 1;
new_sock->sk->sk_allocation = GFP_ATOMIC;
ret = o2net_set_nodelay(new_sock);
@@ -1787,17 +1839,25 @@ static int o2net_accept_one(struct socket *sock)
node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
if (node == NULL) {
- mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n",
- &sin.sin_addr.s_addr, ntohs(sin.sin_port));
+ printk(KERN_NOTICE "o2net: Attempt to connect from unknown "
+ "node at %pI4:%d\n", &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
- if (o2nm_this_node() > node->nd_num) {
- mlog(ML_NOTICE, "unexpected connect attempted from a lower "
- "numbered node '%s' at " "%pI4:%d with num %u\n",
- node->nd_name, &sin.sin_addr.s_addr,
- ntohs(sin.sin_port), node->nd_num);
+ if (o2nm_this_node() >= node->nd_num) {
+ local_node = o2nm_get_node_by_num(o2nm_this_node());
+ if (local_node)
+ printk(KERN_NOTICE "o2net: Unexpected connect attempt "
+ "seen at node '%s' (%u, %pI4:%d) from "
+ "node '%s' (%u, %pI4:%d)\n",
+ local_node->nd_name, local_node->nd_num,
+ &(local_node->nd_ipv4_address),
+ ntohs(local_node->nd_ipv4_port),
+ node->nd_name,
+ node->nd_num, &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
ret = -EINVAL;
goto out;
}
@@ -1822,10 +1882,10 @@ static int o2net_accept_one(struct socket *sock)
ret = 0;
spin_unlock(&nn->nn_lock);
if (ret) {
- mlog(ML_NOTICE, "attempt to connect from node '%s' at "
- "%pI4:%d but it already has an open connection\n",
- node->nd_name, &sin.sin_addr.s_addr,
- ntohs(sin.sin_port));
+ printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' "
+ "at %pI4:%d but it already has an open connection\n",
+ node->nd_name, &sin.sin_addr.s_addr,
+ ntohs(sin.sin_port));
goto out;
}
@@ -1854,21 +1914,48 @@ out:
sock_release(new_sock);
if (node)
o2nm_node_put(node);
+ if (local_node)
+ o2nm_node_put(local_node);
if (sc)
sc_put(sc);
return ret;
}
+/*
+ * This function is invoked in response to one or more
+ * pending accepts at softIRQ level. We must drain the
+ * entire que before returning.
+ */
+
static void o2net_accept_many(struct work_struct *work)
{
struct socket *sock = o2net_listen_sock;
- while (o2net_accept_one(sock) == 0)
+ int more;
+ int err;
+
+ /*
+ * It is critical to note that due to interrupt moderation
+ * at the network driver level, we can't assume to get a
+ * softIRQ for every single conn since tcp SYN packets
+ * can arrive back-to-back, and therefore many pending
+ * accepts may result in just 1 softIRQ. If we terminate
+ * the o2net_accept_one() loop upon seeing an err, what happens
+ * to the rest of the conns in the queue? If no new SYN
+ * arrives for hours, no softIRQ will be delivered,
+ * and the connections will just sit in the queue.
+ */
+
+ for (;;) {
+ err = o2net_accept_one(sock, &more);
+ if (!more)
+ break;
cond_resched();
+ }
}
-static void o2net_listen_data_ready(struct sock *sk, int bytes)
+static void o2net_listen_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
read_lock(&sk->sk_callback_lock);
ready = sk->sk_user_data;
@@ -1877,18 +1964,29 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes)
goto out;
}
- /* ->sk_data_ready is also called for a newly established child socket
- * before it has been accepted and the acceptor has set up their
- * data_ready.. we only want to queue listen work for our listening
- * socket */
+ /* This callback may called twice when a new connection
+ * is being established as a child socket inherits everything
+ * from a parent LISTEN socket, including the data_ready cb of
+ * the parent. This leads to a hazard. In o2net_accept_one()
+ * we are still initializing the child socket but have not
+ * changed the inherited data_ready callback yet when
+ * data starts arriving.
+ * We avoid this hazard by checking the state.
+ * For the listening socket, the state will be TCP_LISTEN; for the new
+ * socket, will be TCP_ESTABLISHED. Also, in this case,
+ * sk->sk_user_data is not a valid function pointer.
+ */
+
if (sk->sk_state == TCP_LISTEN) {
- mlog(ML_TCP, "bytes: %d\n", bytes);
queue_work(o2net_wq, &o2net_listen_work);
+ } else {
+ ready = NULL;
}
out:
read_unlock(&sk->sk_callback_lock);
- ready(sk, bytes);
+ if (ready != NULL)
+ ready(sk);
}
static int o2net_open_listening_sock(__be32 addr, __be16 port)
@@ -1903,7 +2001,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0) {
- mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret);
+ printk(KERN_ERR "o2net: Error %d while creating socket\n", ret);
goto out;
}
@@ -1917,19 +2015,18 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
o2net_listen_sock = sock;
INIT_WORK(&o2net_listen_work, o2net_accept_many);
- sock->sk->sk_reuse = 1;
+ sock->sk->sk_reuse = SK_CAN_REUSE;
ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
if (ret < 0) {
- mlog(ML_ERROR, "unable to bind socket at %pI4:%u, "
- "ret=%d\n", &addr, ntohs(port), ret);
+ printk(KERN_ERR "o2net: Error %d while binding socket at "
+ "%pI4:%u\n", ret, &addr, ntohs(port));
goto out;
}
ret = sock->ops->listen(sock, 64);
- if (ret < 0) {
- mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n",
- &addr, ntohs(port), ret);
- }
+ if (ret < 0)
+ printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n",
+ ret, &addr, ntohs(port));
out:
if (ret) {
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index fd6179eb26d..5bada2a69b5 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
struct list_head *unreg_list);
void o2net_unregister_handler_list(struct list_head *list);
+void o2net_fill_node_map(unsigned long *map, unsigned bytes);
+
struct o2nm_node;
int o2net_register_hb_callbacks(void);
void o2net_unregister_hb_callbacks(void);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 96fa7ebc530..dc024367110 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -129,7 +129,7 @@ struct o2net_node {
struct o2net_sock_container {
struct kref sc_kref;
- /* the next two are vaild for the life time of the sc */
+ /* the next two are valid for the life time of the sc */
struct socket *sc_sock;
struct o2nm_node *sc_node;
@@ -165,19 +165,28 @@ struct o2net_sock_container {
/* original handlers for the sockets */
void (*sc_state_change)(struct sock *sk);
- void (*sc_data_ready)(struct sock *sk, int bytes);
-#ifdef CONFIG_DEBUG_FS
- struct list_head sc_net_debug_item;
-#endif
- struct timeval sc_tv_timer;
- struct timeval sc_tv_data_ready;
- struct timeval sc_tv_advance_start;
- struct timeval sc_tv_advance_stop;
- struct timeval sc_tv_func_start;
- struct timeval sc_tv_func_stop;
+ void (*sc_data_ready)(struct sock *sk);
+
u32 sc_msg_key;
u16 sc_msg_type;
+#ifdef CONFIG_DEBUG_FS
+ struct list_head sc_net_debug_item;
+ ktime_t sc_tv_timer;
+ ktime_t sc_tv_data_ready;
+ ktime_t sc_tv_advance_start;
+ ktime_t sc_tv_advance_stop;
+ ktime_t sc_tv_func_start;
+ ktime_t sc_tv_func_stop;
+#endif
+#ifdef CONFIG_OCFS2_FS_STATS
+ ktime_t sc_tv_acquiry_total;
+ ktime_t sc_tv_send_total;
+ ktime_t sc_tv_status_total;
+ u32 sc_send_count;
+ u32 sc_recv_count;
+ ktime_t sc_tv_process_total;
+#endif
struct mutex sc_send_lock;
};
@@ -220,9 +229,9 @@ struct o2net_send_tracking {
u32 st_msg_type;
u32 st_msg_key;
u8 st_node;
- struct timeval st_sock_time;
- struct timeval st_send_time;
- struct timeval st_status_time;
+ ktime_t st_sock_time;
+ ktime_t st_send_time;
+ ktime_t st_status_time;
};
#else
struct o2net_send_tracking {
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
deleted file mode 100644
index a56eee6abad..00000000000
--- a/fs/ocfs2/cluster/ver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define CLUSTER_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
-
-void cluster_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
deleted file mode 100644
index 32554c3382c..00000000000
--- a/fs/ocfs2/cluster/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef O2CLUSTER_VER_H
-#define O2CLUSTER_VER_H
-
-void cluster_print_version(void);
-
-#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b4957c7d9fe..e2e05a106be 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -28,7 +28,6 @@
#include <linux/slab.h>
#include <linux/namei.h>
-#define MLOG_MASK_PREFIX ML_DCACHE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -38,24 +37,48 @@
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
-#include "super.h"
+#include "ocfs2_trace.h"
+
+void ocfs2_dentry_attach_gen(struct dentry *dentry)
+{
+ unsigned long gen =
+ OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+ BUG_ON(dentry->d_inode);
+ dentry->d_fsdata = (void *)gen;
+}
-static int ocfs2_dentry_revalidate(struct dentry *dentry,
- struct nameidata *nd)
+static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode;
int ret = 0; /* if all else fails, just return false */
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+ struct ocfs2_super *osb;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
- mlog_entry("(0x%p, '%.*s')\n", dentry,
- dentry->d_name.len, dentry->d_name.name);
+ inode = dentry->d_inode;
+ osb = OCFS2_SB(dentry->d_sb);
- /* Never trust a negative dentry - force a new lookup. */
+ trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len,
+ dentry->d_name.name);
+
+ /* For a negative dentry -
+ * check the generation number of the parent and compare with the
+ * one stored in the inode.
+ */
if (inode == NULL) {
- mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
- dentry->d_name.name);
- goto bail;
+ unsigned long gen = (unsigned long) dentry->d_fsdata;
+ unsigned long pgen;
+ spin_lock(&dentry->d_lock);
+ pgen = OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+ spin_unlock(&dentry->d_lock);
+ trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
+ dentry->d_name.name,
+ pgen, gen);
+ if (gen != pgen)
+ goto bail;
+ goto valid;
}
BUG_ON(!osb);
@@ -67,8 +90,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
/* did we or someone else delete this inode? */
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
- mlog(0, "inode (%llu) deleted, returning false\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_dentry_revalidate_delete(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
goto bail;
}
spin_unlock(&OCFS2_I(inode)->ip_lock);
@@ -78,10 +101,9 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
* inode nlink hits zero, it never goes back.
*/
if (inode->i_nlink == 0) {
- mlog(0, "Inode %llu orphaned, returning false "
- "dir = %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- S_ISDIR(inode->i_mode));
+ trace_ocfs2_dentry_revalidate_orphaned(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ S_ISDIR(inode->i_mode));
goto bail;
}
@@ -90,17 +112,16 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
* redo it.
*/
if (!dentry->d_fsdata) {
- mlog(0, "Inode %llu doesn't have dentry lock, "
- "returning false\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_dentry_revalidate_nofsdata(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
goto bail;
}
+valid:
ret = 1;
bail:
- mlog_exit(ret);
-
+ trace_ocfs2_dentry_revalidate_ret(ret);
return ret;
}
@@ -148,28 +169,24 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
u64 parent_blkno,
int skip_unhashed)
{
- struct list_head *p;
- struct dentry *dentry = NULL;
-
- spin_lock(&dcache_lock);
-
- list_for_each(p, &inode->i_dentry) {
- dentry = list_entry(p, struct dentry, d_alias);
+ struct dentry *dentry;
+ spin_lock(&inode->i_lock);
+ hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+ spin_lock(&dentry->d_lock);
if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
- mlog(0, "dentry found: %.*s\n",
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_find_local_alias(dentry->d_name.len,
+ dentry->d_name.name);
- dget_locked(dentry);
- break;
+ dget_dlock(dentry);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&inode->i_lock);
+ return dentry;
}
-
- dentry = NULL;
+ spin_unlock(&dentry->d_lock);
}
-
- spin_unlock(&dcache_lock);
-
- return dentry;
+ spin_unlock(&inode->i_lock);
+ return NULL;
}
DEFINE_SPINLOCK(dentry_attach_lock);
@@ -214,9 +231,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
struct dentry *alias;
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
- mlog(0, "Attach \"%.*s\", parent %llu, fsdata: %p\n",
- dentry->d_name.len, dentry->d_name.name,
- (unsigned long long)parent_blkno, dl);
+ trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name,
+ (unsigned long long)parent_blkno, dl);
/*
* Negative dentry. We ignore these for now.
@@ -227,6 +243,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
if (!inode)
return 0;
+ if (!dentry->d_inode && dentry->d_fsdata) {
+ /* Converting a negative dentry to positive
+ Clear dentry->d_fsdata */
+ dentry->d_fsdata = dl = NULL;
+ }
+
if (dl) {
mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
" \"%.*s\": old parent: %llu, new: %llu\n",
@@ -260,7 +282,9 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
(unsigned long long)parent_blkno,
(unsigned long long)dl->dl_parent_blkno);
- mlog(0, "Found: %s\n", dl->dl_lockres.l_name);
+ trace_ocfs2_dentry_attach_lock_found(dl->dl_lockres.l_name,
+ (unsigned long long)parent_blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
goto out_attach;
}
@@ -321,52 +345,6 @@ out_attach:
return ret;
}
-DEFINE_SPINLOCK(dentry_list_lock);
-
-/* We limit the number of dentry locks to drop in one go. We have
- * this limit so that we don't starve other users of ocfs2_wq. */
-#define DL_INODE_DROP_COUNT 64
-
-/* Drop inode references from dentry locks */
-static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
-{
- struct ocfs2_dentry_lock *dl;
-
- spin_lock(&dentry_list_lock);
- while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
- dl = osb->dentry_lock_list;
- osb->dentry_lock_list = dl->dl_next;
- spin_unlock(&dentry_list_lock);
- iput(dl->dl_inode);
- kfree(dl);
- spin_lock(&dentry_list_lock);
- }
- spin_unlock(&dentry_list_lock);
-}
-
-void ocfs2_drop_dl_inodes(struct work_struct *work)
-{
- struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
- dentry_lock_work);
-
- __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
- /*
- * Don't queue dropping if umount is in progress. We flush the
- * list in ocfs2_dismount_volume
- */
- spin_lock(&dentry_list_lock);
- if (osb->dentry_lock_list &&
- !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
- queue_work(ocfs2_wq, &osb->dentry_lock_work);
- spin_unlock(&dentry_list_lock);
-}
-
-/* Flush the whole work queue */
-void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
-{
- __ocfs2_drop_dl_inodes(osb, -1);
-}
-
/*
* ocfs2_dentry_iput() and friends.
*
@@ -391,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl)
{
+ iput(dl->dl_inode);
ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
ocfs2_lock_res_free(&dl->dl_lockres);
-
- /* We leave dropping of inode reference to ocfs2_wq as that can
- * possibly lead to inode deletion which gets tricky */
- spin_lock(&dentry_list_lock);
- if (!osb->dentry_lock_list &&
- !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
- queue_work(ocfs2_wq, &osb->dentry_lock_work);
- dl->dl_next = osb->dentry_lock_list;
- osb->dentry_lock_list = dl;
- spin_unlock(&dentry_list_lock);
+ kfree(dl);
}
void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl)
{
- int unlock;
+ int unlock = 0;
BUG_ON(dl->dl_count == 0);
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index f5dd1789acf..55f58892b15 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,13 +29,8 @@
extern const struct dentry_operations ocfs2_dentry_ops;
struct ocfs2_dentry_lock {
- /* Use count of dentry lock */
unsigned int dl_count;
- union {
- /* Linked list of dentry locks to release */
- struct ocfs2_dentry_lock *dl_next;
- u64 dl_parent_blkno;
- };
+ u64 dl_parent_blkno;
/*
* The ocfs2_dentry_lock keeps an inode reference until
@@ -49,14 +44,9 @@ struct ocfs2_dentry_lock {
int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
u64 parent_blkno);
-extern spinlock_t dentry_list_lock;
-
void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl);
-void ocfs2_drop_dl_inodes(struct work_struct *work);
-void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
-
struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
int skip_unhashed);
@@ -64,5 +54,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
struct inode *old_dir, struct inode *new_dir);
extern spinlock_t dentry_attach_lock;
+void ocfs2_dentry_attach_gen(struct dentry *dentry);
#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index efd77d071c8..0717662b4ae 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -43,7 +43,6 @@
#include <linux/quotaops.h>
#include <linux/sort.h>
-#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -61,13 +60,13 @@
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static unsigned char ocfs2_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
@@ -322,21 +321,23 @@ static int ocfs2_check_dir_entry(struct inode * dir,
const char *error_msg = NULL;
const int rlen = le16_to_cpu(de->rec_len);
- if (rlen < OCFS2_DIR_REC_LEN(1))
+ if (unlikely(rlen < OCFS2_DIR_REC_LEN(1)))
error_msg = "rec_len is smaller than minimal";
- else if (rlen % 4 != 0)
+ else if (unlikely(rlen % 4 != 0))
error_msg = "rec_len % 4 != 0";
- else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
+ else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
- else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+ else if (unlikely(
+ ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))
error_msg = "directory entry across blocks";
- if (error_msg != NULL)
+ if (unlikely(error_msg != NULL))
mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
"offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
(unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
de->name_len);
+
return error_msg == NULL ? 1 : 0;
}
@@ -354,7 +355,7 @@ static inline int ocfs2_match(int len,
/*
* Returns 0 if not found, -1 on failure, and 1 on success
*/
-static int inline ocfs2_search_dirblock(struct buffer_head *bh,
+static inline int ocfs2_search_dirblock(struct buffer_head *bh,
struct inode *dir,
const char *name, int namelen,
unsigned long offset,
@@ -367,8 +368,6 @@ static int inline ocfs2_search_dirblock(struct buffer_head *bh,
int de_len;
int ret = 0;
- mlog_entry_void();
-
de_buf = first_de;
dlimit = de_buf + bytes;
@@ -402,7 +401,7 @@ static int inline ocfs2_search_dirblock(struct buffer_head *bh,
}
bail:
- mlog_exit(ret);
+ trace_ocfs2_search_dirblock(ret);
return ret;
}
@@ -447,8 +446,7 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
* We don't validate dirents here, that's handled
* in-place when the code walks them.
*/
- mlog(0, "Validating dirblock %llu\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
@@ -706,8 +704,6 @@ static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
int num = 0;
int nblocks, i, err;
- mlog_entry_void();
-
sb = dir->i_sb;
nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
@@ -788,7 +784,7 @@ cleanup_and_exit:
for (; ra_ptr < ra_max; ra_ptr++)
brelse(bh_use[ra_ptr]);
- mlog_exit_ptr(ret);
+ trace_ocfs2_find_entry_el(ret);
return ret;
}
@@ -950,11 +946,9 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
goto out;
}
- mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
- "returns: %llu\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- namelen, name, hinfo->major_hash, hinfo->minor_hash,
- (unsigned long long)phys);
+ trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno,
+ namelen, name, hinfo->major_hash,
+ hinfo->minor_hash, (unsigned long long)phys);
ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
if (ret) {
@@ -964,9 +958,9 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
- mlog(0, "leaf info: num_used: %d, count: %d\n",
- le16_to_cpu(dx_leaf->dl_list.de_num_used),
- le16_to_cpu(dx_leaf->dl_list.de_count));
+ trace_ocfs2_dx_dir_search_leaf_info(
+ le16_to_cpu(dx_leaf->dl_list.de_num_used),
+ le16_to_cpu(dx_leaf->dl_list.de_count));
entry_list = &dx_leaf->dl_list;
@@ -1166,8 +1160,6 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
int i, status = -ENOENT;
ocfs2_journal_access_func access = ocfs2_journal_access_db;
- mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
-
if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
access = ocfs2_journal_access_di;
@@ -1191,10 +1183,9 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
if (pde)
le16_add_cpu(&pde->rec_len,
le16_to_cpu(de->rec_len));
- else
- de->inode = 0;
+ de->inode = 0;
dir->i_version++;
- status = ocfs2_journal_dirty(handle, bh);
+ ocfs2_journal_dirty(handle, bh);
goto bail;
}
i += le16_to_cpu(de->rec_len);
@@ -1202,7 +1193,6 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
}
bail:
- mlog_exit(status);
return status;
}
@@ -1348,8 +1338,8 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
}
}
- mlog(0, "Dir %llu: delete entry at index: %d\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
+ trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno,
+ index);
ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
@@ -1632,8 +1622,6 @@ int __ocfs2_add_entry(handle_t *handle,
struct buffer_head *insert_bh = lookup->dl_leaf_bh;
char *data_start = insert_bh->b_data;
- mlog_entry_void();
-
if (!namelen)
return -EINVAL;
@@ -1752,7 +1740,7 @@ int __ocfs2_add_entry(handle_t *handle,
ocfs2_recalc_free_list(dir, handle, lookup);
dir->i_version++;
- status = ocfs2_journal_dirty(handle, insert_bh);
+ ocfs2_journal_dirty(handle, insert_bh);
retval = 0;
goto bail;
}
@@ -1765,18 +1753,18 @@ int __ocfs2_add_entry(handle_t *handle,
* from ever getting here. */
retval = -ENOSPC;
bail:
+ if (retval)
+ mlog_errno(retval);
- mlog_exit(retval);
return retval;
}
static int ocfs2_dir_foreach_blk_id(struct inode *inode,
u64 *f_version,
- loff_t *f_pos, void *priv,
- filldir_t filldir, int *filldir_err)
+ struct dir_context *ctx)
{
- int ret, i, filldir_ret;
- unsigned long offset = *f_pos;
+ int ret, i;
+ unsigned long offset = ctx->pos;
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
@@ -1792,8 +1780,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
di = (struct ocfs2_dinode *)di_bh->b_data;
data = &di->id2.i_data;
- while (*f_pos < i_size_read(inode)) {
-revalidate:
+ while (ctx->pos < i_size_read(inode)) {
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
@@ -1813,50 +1800,31 @@ revalidate:
break;
i += le16_to_cpu(de->rec_len);
}
- *f_pos = offset = i;
+ ctx->pos = offset = i;
*f_version = inode->i_version;
}
- de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos);
- if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) {
+ de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
+ if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) {
/* On error, skip the f_pos to the end. */
- *f_pos = i_size_read(inode);
- goto out;
+ ctx->pos = i_size_read(inode);
+ break;
}
offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- u64 version = *f_version;
unsigned char d_type = DT_UNKNOWN;
if (de->file_type < OCFS2_FT_MAX)
d_type = ocfs2_filetype_table[de->file_type];
- filldir_ret = filldir(priv, de->name,
- de->name_len,
- *f_pos,
- le64_to_cpu(de->inode),
- d_type);
- if (filldir_ret) {
- if (filldir_err)
- *filldir_err = filldir_ret;
- break;
- }
- if (version != *f_version)
- goto revalidate;
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le64_to_cpu(de->inode), d_type))
+ goto out;
}
- *f_pos += le16_to_cpu(de->rec_len);
+ ctx->pos += le16_to_cpu(de->rec_len);
}
-
out:
brelse(di_bh);
-
return 0;
}
@@ -1866,27 +1834,26 @@ out:
*/
static int ocfs2_dir_foreach_blk_el(struct inode *inode,
u64 *f_version,
- loff_t *f_pos, void *priv,
- filldir_t filldir, int *filldir_err)
+ struct dir_context *ctx,
+ bool persist)
{
- int error = 0;
unsigned long offset, blk, last_ra_blk = 0;
- int i, stored;
+ int i;
struct buffer_head * bh, * tmp;
struct ocfs2_dir_entry * de;
struct super_block * sb = inode->i_sb;
unsigned int ra_sectors = 16;
+ int stored = 0;
- stored = 0;
bh = NULL;
- offset = (*f_pos) & (sb->s_blocksize - 1);
+ offset = ctx->pos & (sb->s_blocksize - 1);
- while (!error && !stored && *f_pos < i_size_read(inode)) {
- blk = (*f_pos) >> sb->s_blocksize_bits;
+ while (ctx->pos < i_size_read(inode)) {
+ blk = ctx->pos >> sb->s_blocksize_bits;
if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
/* Skip the corrupt dirblock and keep trying */
- *f_pos += sb->s_blocksize - offset;
+ ctx->pos += sb->s_blocksize - offset;
continue;
}
@@ -1908,7 +1875,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
ra_sectors = 8;
}
-revalidate:
/* If the dir block has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
@@ -1928,93 +1894,64 @@ revalidate:
i += le16_to_cpu(de->rec_len);
}
offset = i;
- *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1))
+ ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
*f_version = inode->i_version;
}
- while (!error && *f_pos < i_size_read(inode)
+ while (ctx->pos < i_size_read(inode)
&& offset < sb->s_blocksize) {
de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
/* On error, skip the f_pos to the
next block. */
- *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1;
+ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
brelse(bh);
- goto out;
+ continue;
}
- offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
- /* We might block in the next section
- * if the data destination is
- * currently swapped out. So, use a
- * version stamp to detect whether or
- * not the directory has been modified
- * during the copy operation.
- */
- unsigned long version = *f_version;
unsigned char d_type = DT_UNKNOWN;
if (de->file_type < OCFS2_FT_MAX)
d_type = ocfs2_filetype_table[de->file_type];
- error = filldir(priv, de->name,
+ if (!dir_emit(ctx, de->name,
de->name_len,
- *f_pos,
le64_to_cpu(de->inode),
- d_type);
- if (error) {
- if (filldir_err)
- *filldir_err = error;
- break;
+ d_type)) {
+ brelse(bh);
+ return 0;
}
- if (version != *f_version)
- goto revalidate;
- stored ++;
+ stored++;
}
- *f_pos += le16_to_cpu(de->rec_len);
+ offset += le16_to_cpu(de->rec_len);
+ ctx->pos += le16_to_cpu(de->rec_len);
}
offset = 0;
brelse(bh);
bh = NULL;
+ if (!persist && stored)
+ break;
}
-
- stored = 0;
-out:
- return stored;
+ return 0;
}
static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
- loff_t *f_pos, void *priv, filldir_t filldir,
- int *filldir_err)
+ struct dir_context *ctx,
+ bool persist)
{
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv,
- filldir, filldir_err);
-
- return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir,
- filldir_err);
+ return ocfs2_dir_foreach_blk_id(inode, f_version, ctx);
+ return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist);
}
/*
* This is intended to be called from inside other kernel functions,
* so we fake some arguments.
*/
-int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
- filldir_t filldir)
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
{
- int ret = 0, filldir_err = 0;
u64 version = inode->i_version;
-
- while (*f_pos < i_size_read(inode)) {
- ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
- filldir, &filldir_err);
- if (ret || filldir_err)
- break;
- }
-
- if (ret > 0)
- ret = -EIO;
-
+ ocfs2_dir_foreach_blk(inode, &version, ctx, true);
return 0;
}
@@ -2022,16 +1959,15 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
* ocfs2_readdir()
*
*/
-int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int ocfs2_readdir(struct file *file, struct dir_context *ctx)
{
int error = 0;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
int lock_level = 0;
- mlog_entry("dirino=%llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
- error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+ error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
if (lock_level && error >= 0) {
/* We release EX lock which used to update atime
* and get PR lock again to reduce contention
@@ -2047,13 +1983,13 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
goto bail_nolock;
}
- error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
- dirent, filldir, NULL);
+ error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
ocfs2_inode_unlock(inode, lock_level);
+ if (error)
+ mlog_errno(error);
bail_nolock:
- mlog_exit(error);
return error;
}
@@ -2069,8 +2005,8 @@ int ocfs2_find_files_on_disk(const char *name,
{
int status = -ENOENT;
- mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_find_files_on_disk(namelen, name, blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
status = ocfs2_find_entry(name, namelen, inode, lookup);
if (status)
@@ -2114,8 +2050,8 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
int ret;
struct ocfs2_dir_lookup_result lookup = { NULL, };
- mlog_entry("dir %llu, name '%.*s'\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
+ trace_ocfs2_check_dir_for_entry(
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
ret = -EEXIST;
if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
@@ -2125,11 +2061,13 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
bail:
ocfs2_free_dir_lookup_result(&lookup);
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
}
struct ocfs2_empty_dir_priv {
+ struct dir_context ctx;
unsigned seen_dot;
unsigned seen_dot_dot;
unsigned seen_other;
@@ -2214,10 +2152,9 @@ out:
int ocfs2_empty_dir(struct inode *inode)
{
int ret;
- loff_t start = 0;
- struct ocfs2_empty_dir_priv priv;
-
- memset(&priv, 0, sizeof(priv));
+ struct ocfs2_empty_dir_priv priv = {
+ .ctx.actor = ocfs2_empty_dir_filldir,
+ };
if (ocfs2_dir_indexed(inode)) {
ret = ocfs2_empty_dir_dx(inode, &priv);
@@ -2229,7 +2166,7 @@ int ocfs2_empty_dir(struct inode *inode)
*/
}
- ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
+ ret = ocfs2_dir_foreach(inode, &priv.ctx);
if (ret)
mlog_errno(ret);
@@ -2297,15 +2234,10 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
}
ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
-
ocfs2_journal_dirty(handle, di_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
i_size_write(inode, size);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_blocks = ocfs2_inode_sector_count(inode);
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
@@ -2329,8 +2261,6 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
struct buffer_head *new_bh = NULL;
struct ocfs2_dir_entry *de;
- mlog_entry_void();
-
if (ocfs2_new_dir_wants_trailer(inode))
size = ocfs2_dir_trailer_blk_off(parent->i_sb);
@@ -2366,14 +2296,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
ocfs2_init_dir_trailer(inode, new_bh, size);
}
- status = ocfs2_journal_dirty(handle, new_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, new_bh);
i_size_write(inode, inode->i_sb->s_blocksize);
- inode->i_nlink = 2;
+ set_nlink(inode, 2);
inode->i_blocks = ocfs2_inode_sector_count(inode);
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
@@ -2389,7 +2315,6 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
bail:
brelse(new_bh);
- mlog_exit(status);
return status;
}
@@ -2404,27 +2329,27 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
int ret;
struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
u16 dr_suballoc_bit;
- u64 dr_blkno;
+ u64 suballoc_loc, dr_blkno;
unsigned int num_bits;
struct buffer_head *dx_root_bh = NULL;
struct ocfs2_dx_root_block *dx_root;
struct ocfs2_dir_block_trailer *trailer =
ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
- ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
- &num_bits, &dr_blkno);
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
+ &dr_suballoc_bit, &num_bits, &dr_blkno);
if (ret) {
mlog_errno(ret);
goto out;
}
- mlog(0, "Dir %llu, attach new index block: %llu\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- (unsigned long long)dr_blkno);
+ trace_ocfs2_dx_dir_attach_index(
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)dr_blkno);
dx_root_bh = sb_getblk(osb->sb, dr_blkno);
if (dx_root_bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
goto out;
}
ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
@@ -2440,6 +2365,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
memset(dx_root, 0, osb->sb->s_blocksize);
strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2458,10 +2384,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
dx_root->dr_list.l_count =
cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
}
-
- ret = ocfs2_journal_dirty(handle, dx_root_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, dx_root_bh);
ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
@@ -2472,12 +2395,12 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
di->i_dx_root = cpu_to_le64(dr_blkno);
+ spin_lock(&OCFS2_I(dir)->ip_lock);
OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+ spin_unlock(&OCFS2_I(dir)->ip_lock);
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, di_bh);
*ret_dx_root_bh = dx_root_bh;
dx_root_bh = NULL;
@@ -2499,7 +2422,7 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
for (i = 0; i < num_dx_leaves; i++) {
bh = sb_getblk(osb->sb, start_blk + i);
if (bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
goto out;
}
dx_leaves[i] = bh;
@@ -2522,11 +2445,10 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
dx_leaf->dl_list.de_count =
cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
- mlog(0,
- "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- (unsigned long long)bh->b_blocknr,
- le16_to_cpu(dx_leaf->dl_list.de_count));
+ trace_ocfs2_dx_dir_format_cluster(
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(dx_leaf->dl_list.de_count));
ocfs2_journal_dirty(handle, bh);
}
@@ -2558,7 +2480,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
* chance of contiguousness as the directory grows in number
* of entries.
*/
- ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
+ ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2770,12 +2692,11 @@ static void ocfs2_dx_dir_index_root_block(struct inode *dir,
ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
- mlog(0,
- "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
- (unsigned long long)dir->i_ino, hinfo.major_hash,
- hinfo.minor_hash,
- le16_to_cpu(dx_root->dr_entries.de_num_used),
- de->name_len, de->name);
+ trace_ocfs2_dx_dir_index_root_block(
+ (unsigned long long)dir->i_ino,
+ hinfo.major_hash, hinfo.minor_hash,
+ de->name_len, de->name,
+ le16_to_cpu(dx_root->dr_entries.de_num_used));
ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
dirent_blk);
@@ -2892,7 +2813,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
bytes = blocks_wanted << sb->s_blocksize_bits;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_inode_info *oi = OCFS2_I(dir);
- struct ocfs2_alloc_context *data_ac;
+ struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
struct buffer_head *dirdata_bh = NULL;
struct buffer_head *dx_root_bh = NULL;
@@ -2991,7 +2912,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* if we only get one now, that's enough to continue. The rest
* will be claimed after the conversion to extents.
*/
- ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+ if (ocfs2_dir_resv_allowed(osb))
+ data_ac->ac_resv = &oi->ip_la_data_resv;
+ ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -3006,7 +2929,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
dirdata_bh = sb_getblk(sb, blkno);
if (!dirdata_bh) {
- ret = -EIO;
+ ret = -ENOMEM;
mlog_errno(ret);
goto out_commit;
}
@@ -3034,11 +2957,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_init_dir_trailer(dir, dirdata_bh, i);
}
- ret = ocfs2_journal_dirty(handle, dirdata_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
+ ocfs2_journal_dirty(handle, dirdata_bh);
if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
/*
@@ -3086,6 +3006,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
di->i_size = cpu_to_le64(sb->s_blocksize);
di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
/*
* This should never fail as our extent list is empty and all
@@ -3104,11 +3025,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
*/
dir->i_blocks = ocfs2_inode_sector_count(dir);
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, di_bh);
if (ocfs2_supports_indexed_dirs(osb)) {
ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
@@ -3138,7 +3055,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* pass. Claim the 2nd cluster as a separate extent.
*/
if (alloc > len) {
- ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+ ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
&len);
if (ret) {
mlog_errno(ret);
@@ -3244,7 +3161,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
*new_bh = sb_getblk(sb, p_blkno);
if (!*new_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -3252,7 +3169,6 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
bail:
if (did_quota && status < 0)
dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
- mlog_exit(status);
return status;
}
@@ -3287,8 +3203,6 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
struct ocfs2_extent_tree et;
struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
- mlog_entry_void();
-
if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
/*
* This would be a code error as an inline directory should
@@ -3337,8 +3251,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
down_write(&OCFS2_I(dir)->ip_alloc_sem);
drop_alloc_sem = 1;
dir_i_size = i_size_read(dir);
- mlog(0, "extending dir %llu (i_size = %lld)\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
+ trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno,
+ dir_i_size);
/* dir->i_size is always block aligned. */
spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -3369,7 +3283,10 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
goto bail;
}
- credits = ocfs2_calc_extend_credits(sb, el, 1);
+ if (ocfs2_dir_resv_allowed(osb))
+ data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
+
+ credits = ocfs2_calc_extend_credits(sb, el);
} else {
spin_unlock(&OCFS2_I(dir)->ip_lock);
credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@@ -3423,11 +3340,8 @@ do_extend:
} else {
de->rec_len = cpu_to_le16(sb->s_blocksize);
}
- status = ocfs2_journal_dirty(handle, new_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
+ ocfs2_journal_dirty(handle, new_bh);
dir_i_size += dir->i_sb->s_blocksize;
i_size_write(dir, dir_i_size);
@@ -3454,7 +3368,6 @@ bail:
brelse(new_bh);
- mlog_exit(status);
return status;
}
@@ -3601,8 +3514,9 @@ next:
status = 0;
bail:
brelse(bh);
+ if (status)
+ mlog_errno(status);
- mlog_exit(status);
return status;
}
@@ -3805,7 +3719,7 @@ static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
{
int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
- credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
+ credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list);
credits += ocfs2_quota_trans_credits(osb->sb);
return credits;
}
@@ -3833,9 +3747,9 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
struct ocfs2_dx_root_block *dx_root;
struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
- mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- (unsigned long long)leaf_blkno, insert_hash);
+ trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)leaf_blkno,
+ insert_hash);
ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
@@ -3906,11 +3820,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
dx_leaf_sort_swap);
- ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, dx_leaf_bh);
ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
&split_hash);
@@ -3919,8 +3829,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
goto out_commit;
}
- mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
- leaf_cpos, split_hash, insert_hash);
+ trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash);
/*
* We have to carefully order operations here. There are items
@@ -3955,6 +3864,15 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
goto out_commit;
}
+ cpos = split_hash;
+ ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
+ data_ac, meta_ac, new_dx_leaves,
+ num_dx_leaves);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
for (i = 0; i < num_dx_leaves; i++) {
ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
orig_dx_leaves[i],
@@ -3963,15 +3881,14 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
mlog_errno(ret);
goto out_commit;
}
- }
- cpos = split_hash;
- ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
- data_ac, meta_ac, new_dx_leaves,
- num_dx_leaves);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
+ ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
+ new_dx_leaves[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
}
ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
@@ -3982,6 +3899,7 @@ out_commit:
dquot_free_space_nodirty(dir,
ocfs2_clusters_to_bytes(dir->i_sb, 1));
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_commit_trans(osb, handle);
out:
@@ -4220,6 +4138,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
mlog_errno(ret);
did_quota = 0;
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_journal_dirty(handle, dx_root_bh);
out_commit:
@@ -4369,8 +4288,8 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
unsigned int blocks_wanted = 1;
struct buffer_head *bh = NULL;
- mlog(0, "getting ready to insert namelen %d into dir %llu\n",
- namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
+ trace_ocfs2_prepare_dir_for_insert(
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
if (!namelen) {
ret = -EINVAL;
@@ -4482,15 +4401,21 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
goto out_commit;
}
+ spin_lock(&OCFS2_I(dir)->ip_lock);
OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+ spin_unlock(&OCFS2_I(dir)->ip_lock);
di->i_dx_root = cpu_to_le64(0ULL);
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_journal_dirty(handle, di_bh);
blk = le64_to_cpu(dx_root->dr_blkno);
bit = le16_to_cpu(dx_root->dr_suballoc_bit);
- bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+ if (dx_root->dr_suballoc_loc)
+ bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
+ else
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
bit, bg_blkno, 1);
if (ret)
@@ -4551,8 +4476,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
- ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
- &dealloc);
+ ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
+ &dealloc, 0);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index e683f3deb64..f0344b75b14 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -92,9 +92,8 @@ int ocfs2_find_files_on_disk(const char *name,
struct ocfs2_dir_lookup_result *res);
int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
int namelen, u64 *blkno);
-int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
-int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
- filldir_t filldir);
+int ocfs2_readdir(struct file *file, struct dir_context *ctx);
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx);
int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
struct inode *dir,
struct buffer_head *parent_fe_bh,
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index dcebf0d920f..bd1aab1f49a 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,7 +1,7 @@
-EXTRA_CFLAGS += -Ifs/ocfs2
+ccflags-y := -Ifs/ocfs2
obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
- dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
+ dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index dccc439fa08..b46278f9ae4 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,7 +28,6 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
@@ -89,21 +88,31 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
return 0;
}
-static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
+ struct dlm_lock_resource *res;
BUG_ON(!dlm);
BUG_ON(!lock);
+ res = lock->lockres;
+
assert_spin_locked(&dlm->ast_lock);
+
if (!list_empty(&lock->ast_list)) {
- mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n",
+ mlog(ML_ERROR, "%s: res %.*s, lock %u:%llu, "
+ "AST list not empty, pending %d, newlevel %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
lock->ast_pending, lock->ml.type);
BUG();
}
if (lock->ast_pending)
- mlog(0, "lock has an ast getting flushed right now\n");
+ mlog(0, "%s: res %.*s, lock %u:%llu, AST getting flushed\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
/* putting lock on list, add a ref */
dlm_lock_get(lock);
@@ -111,9 +120,10 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
/* check to see if this ast obsoletes the bast */
if (dlm_should_cancel_bast(dlm, lock)) {
- struct dlm_lock_resource *res = lock->lockres;
- mlog(0, "%s: cancelling bast for %.*s\n",
- dlm->name, res->lockname.len, res->lockname.name);
+ mlog(0, "%s: res %.*s, lock %u:%llu, Cancelling BAST\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lock->bast_pending = 0;
list_del_init(&lock->bast_list);
lock->ml.highest_blocked = LKM_IVMODE;
@@ -135,8 +145,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
-
BUG_ON(!dlm);
BUG_ON(!lock);
@@ -146,17 +154,23 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
}
-static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
+ struct dlm_lock_resource *res;
BUG_ON(!dlm);
BUG_ON(!lock);
+
assert_spin_locked(&dlm->ast_lock);
+ res = lock->lockres;
+
BUG_ON(!list_empty(&lock->bast_list));
if (lock->bast_pending)
- mlog(0, "lock has a bast getting flushed right now\n");
+ mlog(0, "%s: res %.*s, lock %u:%llu, BAST getting flushed\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
/* putting lock on list, add a ref */
dlm_lock_get(lock);
@@ -168,8 +182,6 @@ static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
- mlog_entry_void();
-
BUG_ON(!dlm);
BUG_ON(!lock);
@@ -185,9 +197,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
BUG_ON(!lksb);
/* only updates if this node masters the lockres */
+ spin_lock(&res->spinlock);
if (res->owner == dlm->node_num) {
-
- spin_lock(&res->spinlock);
/* check the lksb flags for the direction */
if (lksb->flags & DLM_LKSB_GET_LVB) {
mlog(0, "getting lvb from lockres for %s node\n",
@@ -202,8 +213,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
* here. In the future we might want to clear it at the time
* the put is actually done.
*/
- spin_unlock(&res->spinlock);
}
+ spin_unlock(&res->spinlock);
/* reset any lvb flags on the lksb */
lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
@@ -215,7 +226,10 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
dlm_astlockfunc_t *fn;
struct dlm_lockstatus *lksb;
- mlog_entry_void();
+ mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lksb = lock->lksb;
fn = lock->ast;
@@ -233,7 +247,10 @@ int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lockstatus *lksb;
int lksbflags;
- mlog_entry_void();
+ mlog(0, "%s: res %.*s, lock %u:%llu, Remote AST\n", dlm->name,
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
lksb = lock->lksb;
BUG_ON(lock->ml.node == dlm->node_num);
@@ -252,9 +269,14 @@ void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
{
dlm_bastlockfunc_t *fn = lock->bast;
- mlog_entry_void();
BUG_ON(lock->ml.node != dlm->node_num);
+ mlog(0, "%s: res %.*s, lock %u:%llu, Local BAST, blocked %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ blocked_type);
+
(*fn)(lock->astdata, blocked_type);
}
@@ -270,8 +292,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
struct dlm_lock *lock = NULL;
struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
char *name;
- struct list_head *iter, *head=NULL;
- u64 cookie;
+ struct list_head *head = NULL;
+ __be64 cookie;
u32 flags;
u8 node;
@@ -334,7 +356,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
/* cannot get a proxy ast message if this node owns it */
BUG_ON(res->owner == dlm->node_num);
- mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
+ mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -350,8 +373,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
/* try convert queue for both ast/bast */
head = &res->converting;
lock = NULL;
- list_for_each(iter, head) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry(lock, head, list) {
if (lock->ml.cookie == cookie)
goto do_ast;
}
@@ -362,8 +384,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
else
head = &res->granted;
- list_for_each(iter, head) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry(lock, head, list) {
if (lock->ml.cookie == cookie)
goto do_ast;
}
@@ -384,8 +405,12 @@ do_ast:
if (past->type == DLM_AST) {
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->granted);
- mlog(0, "ast: Adding to granted list... type=%d, "
- "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+ mlog(0, "%s: res %.*s, lock %u:%llu, Granted type %d => %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+ lock->ml.type, lock->ml.convert_type);
+
if (lock->ml.convert_type != LKM_IVMODE) {
lock->ml.type = lock->ml.convert_type;
lock->ml.convert_type = LKM_IVMODE;
@@ -428,9 +453,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
size_t veclen = 1;
int status;
- mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
- res->lockname.len, res->lockname.name, lock->ml.node,
- msg_type, blocked_type);
+ mlog(0, "%s: res %.*s, to %u, type %d, blocked_type %d\n", dlm->name,
+ res->lockname.len, res->lockname.name, lock->ml.node, msg_type,
+ blocked_type);
memset(&past, 0, sizeof(struct dlm_proxy_ast));
past.node_idx = dlm->node_num;
@@ -443,7 +468,6 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
vec[0].iov_len = sizeof(struct dlm_proxy_ast);
vec[0].iov_base = &past;
if (flags & DLM_LKSB_GET_LVB) {
- mlog(0, "returning requested LVB data\n");
be32_add_cpu(&past.flags, LKM_GET_LVB);
vec[1].iov_len = DLM_LVB_LEN;
vec[1].iov_base = lock->lksb->lvb;
@@ -453,7 +477,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
lock->ml.node, &status);
if (ret < 0)
- mlog_errno(ret);
+ mlog(ML_ERROR, "%s: res %.*s, error %d send AST to node %u\n",
+ dlm->name, res->lockname.len, res->lockname.name, ret,
+ lock->ml.node);
else {
if (status == DLM_RECOVERING) {
mlog(ML_ERROR, "sent AST to node %u, it thinks this "
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0102be35980..fae17c640df 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,7 @@
#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
#define DLM_THREAD_MS 200 // flush at least every 200 ms
-#define DLM_HASH_SIZE_DEFAULT (1 << 14)
+#define DLM_HASH_SIZE_DEFAULT (1 << 17)
#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
# define DLM_HASH_PAGES 1
#else
@@ -50,10 +50,10 @@
#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
enum dlm_mle_type {
- DLM_MLE_BLOCK,
- DLM_MLE_MASTER,
- DLM_MLE_MIGRATION,
- DLM_MLE_NUM_TYPES
+ DLM_MLE_BLOCK = 0,
+ DLM_MLE_MASTER = 1,
+ DLM_MLE_MIGRATION = 2,
+ DLM_MLE_NUM_TYPES = 3,
};
struct dlm_master_list_entry {
@@ -82,8 +82,8 @@ struct dlm_master_list_entry {
enum dlm_ast_type {
DLM_AST = 0,
- DLM_BAST,
- DLM_ASTUNLOCK
+ DLM_BAST = 1,
+ DLM_ASTUNLOCK = 2,
};
@@ -108,7 +108,6 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
struct dlm_recovery_ctxt
{
struct list_head resources;
- struct list_head received;
struct list_head node_data;
u8 new_master;
u8 dead_node;
@@ -119,9 +118,9 @@ struct dlm_recovery_ctxt
enum dlm_ctxt_state {
DLM_CTXT_NEW = 0,
- DLM_CTXT_JOINED,
- DLM_CTXT_IN_SHUTDOWN,
- DLM_CTXT_LEAVING,
+ DLM_CTXT_JOINED = 1,
+ DLM_CTXT_IN_SHUTDOWN = 2,
+ DLM_CTXT_LEAVING = 3,
};
struct dlm_ctxt
@@ -144,6 +143,7 @@ struct dlm_ctxt
wait_queue_head_t dlm_join_events;
unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct dlm_recovery_ctxt reco;
spinlock_t master_lock;
@@ -331,6 +331,7 @@ struct dlm_lock_resource
u16 state;
char lvb[DLM_LVB_LEN];
unsigned int inflight_locks;
+ unsigned int inflight_assert_workers;
unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
};
@@ -388,8 +389,8 @@ struct dlm_lock
enum dlm_lockres_list {
DLM_GRANTED_LIST = 0,
- DLM_CONVERTING_LIST,
- DLM_BLOCKED_LIST
+ DLM_CONVERTING_LIST = 1,
+ DLM_BLOCKED_LIST = 2,
};
static inline int dlm_lvb_is_empty(char *lvb)
@@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
return 1;
}
+static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
+{
+ if (idx == DLM_GRANTED_LIST)
+ return "granted";
+ else if (idx == DLM_CONVERTING_LIST)
+ return "converting";
+ else if (idx == DLM_BLOCKED_LIST)
+ return "blocked";
+ else
+ return "unknown";
+}
+
static inline struct list_head *
dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
{
@@ -427,25 +440,28 @@ struct dlm_node_iter
enum {
- DLM_MASTER_REQUEST_MSG = 500,
- DLM_UNUSED_MSG1, /* 501 */
- DLM_ASSERT_MASTER_MSG, /* 502 */
- DLM_CREATE_LOCK_MSG, /* 503 */
- DLM_CONVERT_LOCK_MSG, /* 504 */
- DLM_PROXY_AST_MSG, /* 505 */
- DLM_UNLOCK_LOCK_MSG, /* 506 */
- DLM_DEREF_LOCKRES_MSG, /* 507 */
- DLM_MIGRATE_REQUEST_MSG, /* 508 */
- DLM_MIG_LOCKRES_MSG, /* 509 */
- DLM_QUERY_JOIN_MSG, /* 510 */
- DLM_ASSERT_JOINED_MSG, /* 511 */
- DLM_CANCEL_JOIN_MSG, /* 512 */
- DLM_EXIT_DOMAIN_MSG, /* 513 */
- DLM_MASTER_REQUERY_MSG, /* 514 */
- DLM_LOCK_REQUEST_MSG, /* 515 */
- DLM_RECO_DATA_DONE_MSG, /* 516 */
- DLM_BEGIN_RECO_MSG, /* 517 */
- DLM_FINALIZE_RECO_MSG /* 518 */
+ DLM_MASTER_REQUEST_MSG = 500,
+ DLM_UNUSED_MSG1 = 501,
+ DLM_ASSERT_MASTER_MSG = 502,
+ DLM_CREATE_LOCK_MSG = 503,
+ DLM_CONVERT_LOCK_MSG = 504,
+ DLM_PROXY_AST_MSG = 505,
+ DLM_UNLOCK_LOCK_MSG = 506,
+ DLM_DEREF_LOCKRES_MSG = 507,
+ DLM_MIGRATE_REQUEST_MSG = 508,
+ DLM_MIG_LOCKRES_MSG = 509,
+ DLM_QUERY_JOIN_MSG = 510,
+ DLM_ASSERT_JOINED_MSG = 511,
+ DLM_CANCEL_JOIN_MSG = 512,
+ DLM_EXIT_DOMAIN_MSG = 513,
+ DLM_MASTER_REQUERY_MSG = 514,
+ DLM_LOCK_REQUEST_MSG = 515,
+ DLM_RECO_DATA_DONE_MSG = 516,
+ DLM_BEGIN_RECO_MSG = 517,
+ DLM_FINALIZE_RECO_MSG = 518,
+ DLM_QUERY_REGION = 519,
+ DLM_QUERY_NODEINFO = 520,
+ DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
};
struct dlm_reco_node_data
@@ -458,19 +474,19 @@ struct dlm_reco_node_data
enum {
DLM_RECO_NODE_DATA_DEAD = -1,
DLM_RECO_NODE_DATA_INIT = 0,
- DLM_RECO_NODE_DATA_REQUESTING,
- DLM_RECO_NODE_DATA_REQUESTED,
- DLM_RECO_NODE_DATA_RECEIVING,
- DLM_RECO_NODE_DATA_DONE,
- DLM_RECO_NODE_DATA_FINALIZE_SENT,
+ DLM_RECO_NODE_DATA_REQUESTING = 1,
+ DLM_RECO_NODE_DATA_REQUESTED = 2,
+ DLM_RECO_NODE_DATA_RECEIVING = 3,
+ DLM_RECO_NODE_DATA_DONE = 4,
+ DLM_RECO_NODE_DATA_FINALIZE_SENT = 5,
};
enum {
DLM_MASTER_RESP_NO = 0,
- DLM_MASTER_RESP_YES,
- DLM_MASTER_RESP_MAYBE,
- DLM_MASTER_RESP_ERROR
+ DLM_MASTER_RESP_YES = 1,
+ DLM_MASTER_RESP_MAYBE = 2,
+ DLM_MASTER_RESP_ERROR = 3,
};
@@ -647,9 +663,9 @@ struct dlm_proxy_ast
#define DLM_MOD_KEY (0x666c6172)
enum dlm_query_join_response_code {
JOIN_DISALLOW = 0,
- JOIN_OK,
- JOIN_OK_NO_MAP,
- JOIN_PROTOCOL_MISMATCH,
+ JOIN_OK = 1,
+ JOIN_OK_NO_MAP = 2,
+ JOIN_PROTOCOL_MISMATCH = 3,
};
struct dlm_query_join_packet {
@@ -663,7 +679,7 @@ struct dlm_query_join_packet {
};
union dlm_query_join_response {
- u32 intval;
+ __be32 intval;
struct dlm_query_join_packet packet;
};
@@ -727,6 +743,31 @@ struct dlm_cancel_join
u8 domain[O2NM_MAX_NAME_LEN];
};
+struct dlm_query_region {
+ u8 qr_node;
+ u8 qr_numregions;
+ u8 qr_namelen;
+ u8 pad1;
+ u8 qr_domain[O2NM_MAX_NAME_LEN];
+ u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
+};
+
+struct dlm_node_info {
+ u8 ni_nodenum;
+ u8 pad1;
+ __be16 ni_ipv4_port;
+ __be32 ni_ipv4_address;
+};
+
+struct dlm_query_nodeinfo {
+ u8 qn_nodenum;
+ u8 qn_numnodes;
+ u8 qn_namelen;
+ u8 pad1;
+ u8 qn_domain[O2NM_MAX_NAME_LEN];
+ struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
+};
+
struct dlm_exit_domain
{
u8 node_idx;
@@ -818,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
void dlm_put(struct dlm_ctxt *dlm);
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -836,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res)
kref_get(&res->refs);
}
void dlm_lockres_put(struct dlm_lock_resource *res);
-void __dlm_unhash_lockres(struct dlm_lock_resource *res);
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res);
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
const char *name,
unsigned int len,
@@ -861,49 +901,23 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
const char *name,
unsigned int namelen);
-#define dlm_lockres_set_refmap_bit(bit,res) \
- __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__)
-#define dlm_lockres_clear_refmap_bit(bit,res) \
- __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__)
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit);
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit);
-static inline void __dlm_lockres_set_refmap_bit(int bit,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
-{
- //printk("%s:%d:%.*s: setting bit %d\n", file, line,
- // res->lockname.len, res->lockname.name, bit);
- set_bit(bit, res->refmap);
-}
-
-static inline void __dlm_lockres_clear_refmap_bit(int bit,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
-{
- //printk("%s:%d:%.*s: clearing bit %d\n", file, line,
- // res->lockname.len, res->lockname.name, bit);
- clear_bit(bit, res->refmap);
-}
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
-void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- const char *file,
- int line);
-void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- int new_lockres,
- const char *file,
- int line);
-#define dlm_lockres_drop_inflight_ref(d,r) \
- __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__)
-#define dlm_lockres_grab_inflight_ref(d,r) \
- __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__)
-#define dlm_lockres_grab_inflight_ref_new(d,r) \
- __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__)
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_do_local_ast(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock);
@@ -1028,6 +1042,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
void dlm_clean_master_list(struct dlm_ctxt *dlm,
u8 dead_node);
+void dlm_force_free_mles(struct dlm_ctxt *dlm);
int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
int __dlm_lockres_has_locks(struct dlm_lock_resource *res);
int __dlm_lockres_unused(struct dlm_lock_resource *res);
@@ -1067,11 +1082,9 @@ static inline int dlm_lock_compatible(int existing, int request)
static inline int dlm_lock_on_list(struct list_head *head,
struct dlm_lock *lock)
{
- struct list_head *iter;
struct dlm_lock *tmplock;
- list_for_each(iter, head) {
- tmplock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(tmplock, head, list) {
if (tmplock == lock)
return 1;
}
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index f283bce776b..e36d63ff178 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,7 +28,6 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
@@ -124,13 +123,12 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
int *kick_thread)
{
enum dlm_status status = DLM_NORMAL;
- struct list_head *iter;
struct dlm_lock *tmplock=NULL;
assert_spin_locked(&res->spinlock);
- mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n",
- lock->ml.type, lock->ml.convert_type, type);
+ mlog(0, "type=%d, convert_type=%d, new convert_type=%d\n",
+ lock->ml.type, lock->ml.convert_type, type);
spin_lock(&lock->spinlock);
@@ -186,16 +184,14 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
/* upconvert from here on */
status = DLM_NORMAL;
- list_for_each(iter, &res->granted) {
- tmplock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(tmplock, &res->granted, list) {
if (tmplock == lock)
continue;
if (!dlm_lock_compatible(tmplock->ml.type, type))
goto switch_queues;
}
- list_for_each(iter, &res->converting) {
- tmplock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(tmplock, &res->converting, list) {
if (!dlm_lock_compatible(tmplock->ml.type, type))
goto switch_queues;
/* existing conversion requests take precedence */
@@ -354,7 +350,7 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
struct kvec vec[2];
size_t veclen = 1;
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+ mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
memset(&convert, 0, sizeof(struct dlm_convert_lock));
convert.node_idx = dlm->node_num;
@@ -391,7 +387,9 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
dlm_error(ret);
} else {
- mlog_errno(tmpret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
+ res->owner);
if (dlm_is_host_down(tmpret)) {
/* instead of logging the same network error over
* and over, sleep here and wait for the heartbeat
@@ -423,8 +421,8 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
struct dlm_ctxt *dlm = data;
struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
- struct list_head *iter;
struct dlm_lock *lock = NULL;
+ struct dlm_lock *tmp_lock;
struct dlm_lockstatus *lksb;
enum dlm_status status = DLM_NORMAL;
u32 flags;
@@ -470,14 +468,13 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
dlm_error(status);
goto leave;
}
- list_for_each(iter, &res->granted) {
- lock = list_entry(iter, struct dlm_lock, list);
- if (lock->ml.cookie == cnv->cookie &&
- lock->ml.node == cnv->node_idx) {
+ list_for_each_entry(tmp_lock, &res->granted, list) {
+ if (tmp_lock->ml.cookie == cnv->cookie &&
+ tmp_lock->ml.node == cnv->node_idx) {
+ lock = tmp_lock;
dlm_lock_get(lock);
break;
}
- lock = NULL;
}
spin_unlock(&res->spinlock);
if (!lock) {
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 0cd24cf5439..18f13c2e4a1 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -30,6 +30,7 @@
#include <linux/sysctl.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
+#include <linux/export.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
@@ -95,7 +96,6 @@ static void __dlm_print_lock(struct dlm_lock *lock)
void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
{
- struct list_head *iter2;
struct dlm_lock *lock;
char buf[DLM_LOCKID_NAME_MAX];
@@ -117,18 +117,15 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
res->inflight_locks, atomic_read(&res->asts_reserved));
dlm_print_lockres_refmap(res);
printk(" granted queue:\n");
- list_for_each(iter2, &res->granted) {
- lock = list_entry(iter2, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->granted, list) {
__dlm_print_lock(lock);
}
printk(" converting queue:\n");
- list_for_each(iter2, &res->converting) {
- lock = list_entry(iter2, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->converting, list) {
__dlm_print_lock(lock);
}
printk(" blocked queue:\n");
- list_for_each(iter2, &res->blocked) {
- lock = list_entry(iter2, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->blocked, list) {
__dlm_print_lock(lock);
}
}
@@ -341,7 +338,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
#ifdef CONFIG_DEBUG_FS
-static struct dentry *dlm_debugfs_root = NULL;
+static struct dentry *dlm_debugfs_root;
#define DLM_DEBUGFS_DIR "o2dlm"
#define DLM_DEBUGFS_DLM_STATE "dlm_state"
@@ -370,92 +367,46 @@ static void dlm_debug_get(struct dlm_debug_ctxt *dc)
kref_get(&dc->debug_refcnt);
}
-static struct debug_buffer *debug_buffer_allocate(void)
+static int debug_release(struct inode *inode, struct file *file)
{
- struct debug_buffer *db = NULL;
-
- db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
- if (!db)
- goto bail;
-
- db->len = PAGE_SIZE;
- db->buf = kmalloc(db->len, GFP_KERNEL);
- if (!db->buf)
- goto bail;
-
- return db;
-bail:
- kfree(db);
- return NULL;
-}
-
-static ssize_t debug_buffer_read(struct file *file, char __user *buf,
- size_t nbytes, loff_t *ppos)
-{
- struct debug_buffer *db = file->private_data;
-
- return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
-}
-
-static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
-{
- struct debug_buffer *db = file->private_data;
- loff_t new = -1;
-
- switch (whence) {
- case 0:
- new = off;
- break;
- case 1:
- new = file->f_pos + off;
- break;
- }
-
- if (new < 0 || new > db->len)
- return -EINVAL;
-
- return (file->f_pos = new);
+ free_page((unsigned long)file->private_data);
+ return 0;
}
-static int debug_buffer_release(struct inode *inode, struct file *file)
+static ssize_t debug_read(struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
{
- struct debug_buffer *db = (struct debug_buffer *)file->private_data;
-
- if (db)
- kfree(db->buf);
- kfree(db);
-
- return 0;
+ return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+ i_size_read(file->f_mapping->host));
}
/* end - util funcs */
/* begin - purge list funcs */
-static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
{
struct dlm_lock_resource *res;
int out = 0;
unsigned long total = 0;
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Dumping Purgelist for Domain: %s\n", dlm->name);
spin_lock(&dlm->spinlock);
list_for_each_entry(res, &dlm->purge_list, purge) {
++total;
- if (db->len - out < 100)
+ if (len - out < 100)
continue;
spin_lock(&res->spinlock);
out += stringify_lockname(res->lockname.name,
res->lockname.len,
- db->buf + out, db->len - out);
- out += snprintf(db->buf + out, db->len - out, "\t%ld\n",
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\t%ld\n",
(jiffies - res->last_used)/HZ);
spin_unlock(&res->spinlock);
}
spin_unlock(&dlm->spinlock);
- out += snprintf(db->buf + out, db->len - out,
- "Total on list: %ld\n", total);
+ out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
return out;
}
@@ -463,15 +414,15 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
static int debug_purgelist_open(struct inode *inode, struct file *file)
{
struct dlm_ctxt *dlm = inode->i_private;
- struct debug_buffer *db;
+ char *buf = NULL;
- db = debug_buffer_allocate();
- if (!db)
+ buf = (char *) get_zeroed_page(GFP_NOFS);
+ if (!buf)
goto bail;
- db->len = debug_purgelist_print(dlm, db);
+ i_size_write(inode, debug_purgelist_print(dlm, buf, PAGE_SIZE - 1));
- file->private_data = db;
+ file->private_data = buf;
return 0;
bail:
@@ -480,42 +431,39 @@ bail:
static const struct file_operations debug_purgelist_fops = {
.open = debug_purgelist_open,
- .release = debug_buffer_release,
- .read = debug_buffer_read,
- .llseek = debug_buffer_llseek,
+ .release = debug_release,
+ .read = debug_read,
+ .llseek = generic_file_llseek,
};
/* end - purge list funcs */
/* begin - debug mle funcs */
-static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
{
struct dlm_master_list_entry *mle;
struct hlist_head *bucket;
- struct hlist_node *list;
int i, out = 0;
- unsigned long total = 0, longest = 0, bktcnt;
+ unsigned long total = 0, longest = 0, bucket_count = 0;
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Dumping MLEs for Domain: %s\n", dlm->name);
spin_lock(&dlm->master_lock);
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_master_hash(dlm, i);
- hlist_for_each(list, bucket) {
- mle = hlist_entry(list, struct dlm_master_list_entry,
- master_hash_node);
+ hlist_for_each_entry(mle, bucket, master_hash_node) {
++total;
- ++bktcnt;
- if (db->len - out < 200)
+ ++bucket_count;
+ if (len - out < 200)
continue;
- out += dump_mle(mle, db->buf + out, db->len - out);
+ out += dump_mle(mle, buf + out, len - out);
}
- longest = max(longest, bktcnt);
- bktcnt = 0;
+ longest = max(longest, bucket_count);
+ bucket_count = 0;
}
spin_unlock(&dlm->master_lock);
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Total: %ld, Longest: %ld\n", total, longest);
return out;
}
@@ -523,15 +471,15 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
static int debug_mle_open(struct inode *inode, struct file *file)
{
struct dlm_ctxt *dlm = inode->i_private;
- struct debug_buffer *db;
+ char *buf = NULL;
- db = debug_buffer_allocate();
- if (!db)
+ buf = (char *) get_zeroed_page(GFP_NOFS);
+ if (!buf)
goto bail;
- db->len = debug_mle_print(dlm, db);
+ i_size_write(inode, debug_mle_print(dlm, buf, PAGE_SIZE - 1));
- file->private_data = db;
+ file->private_data = buf;
return 0;
bail:
@@ -540,9 +488,9 @@ bail:
static const struct file_operations debug_mle_fops = {
.open = debug_mle_open,
- .release = debug_buffer_release,
- .read = debug_buffer_read,
- .llseek = debug_buffer_llseek,
+ .release = debug_release,
+ .read = debug_read,
+ .llseek = generic_file_llseek,
};
/* end - debug mle funcs */
@@ -636,8 +584,14 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
spin_lock(&dlm->track_lock);
if (oldres)
track_list = &oldres->tracking;
- else
+ else {
track_list = &dlm->tracking_list;
+ if (list_empty(track_list)) {
+ dl = NULL;
+ spin_unlock(&dlm->track_lock);
+ goto bail;
+ }
+ }
list_for_each_entry(res, track_list, tracking) {
if (&res->tracking == &dlm->tracking_list)
@@ -660,6 +614,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
} else
dl = NULL;
+bail:
/* passed to seq_show */
return dl;
}
@@ -715,7 +670,7 @@ static int debug_lockres_open(struct inode *inode, struct file *file)
goto bail;
}
- seq = (struct seq_file *) file->private_data;
+ seq = file->private_data;
seq->private = dl;
dlm_grab(dlm);
@@ -731,7 +686,7 @@ bail:
static int debug_lockres_release(struct inode *inode, struct file *file)
{
- struct seq_file *seq = (struct seq_file *)file->private_data;
+ struct seq_file *seq = file->private_data;
struct debug_lockres *dl = (struct debug_lockres *)seq->private;
if (dl->dl_res)
@@ -750,7 +705,7 @@ static const struct file_operations debug_lockres_fops = {
/* end - debug lockres funcs */
/* begin - debug state funcs */
-static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
{
int out = 0;
struct dlm_reco_node_data *node;
@@ -774,33 +729,41 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
}
/* Domain: xxxxxxxxxx Key: 0xdfbac769 */
- out += snprintf(db->buf + out, db->len - out,
- "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key);
+ out += snprintf(buf + out, len - out,
+ "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
+ dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor);
/* Thread Pid: xxx Node: xxx State: xxxxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Thread Pid: %d Node: %d State: %s\n",
- dlm->dlm_thread_task->pid, dlm->node_num, state);
+ task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state);
/* Number of Joins: xxx Joining Node: xxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Number of Joins: %d Joining Node: %d\n",
dlm->num_joins, dlm->joining_node);
/* Domain Map: xx xx xx */
- out += snprintf(db->buf + out, db->len - out, "Domain Map: ");
+ out += snprintf(buf + out, len - out, "Domain Map: ");
out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
- db->buf + out, db->len - out);
- out += snprintf(db->buf + out, db->len - out, "\n");
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
+
+ /* Exit Domain Map: xx xx xx */
+ out += snprintf(buf + out, len - out, "Exit Domain Map: ");
+ out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
/* Live Map: xx xx xx */
- out += snprintf(db->buf + out, db->len - out, "Live Map: ");
+ out += snprintf(buf + out, len - out, "Live Map: ");
out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
- db->buf + out, db->len - out);
- out += snprintf(db->buf + out, db->len - out, "\n");
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
/* Lock Resources: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Lock Resources: %d (%d)\n",
atomic_read(&dlm->res_cur_count),
atomic_read(&dlm->res_tot_count));
@@ -812,29 +775,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
cur_mles += atomic_read(&dlm->mle_cur_count[i]);
/* MLEs: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"MLEs: %d (%d)\n", cur_mles, tot_mles);
/* Blocking: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
" Blocking: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
/* Mastery: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
" Mastery: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
/* Migration: xxx (xxx) */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
" Migration: %d (%d)\n",
atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
/* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Lists: Dirty=%s Purge=%s PendingASTs=%s "
"PendingBASTs=%s\n",
(list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
@@ -843,12 +806,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
(list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
/* Purge Count: xxx Refs: xxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Purge Count: %d Refs: %d\n", dlm->purge_count,
atomic_read(&dlm->dlm_refs.refcount));
/* Dead Node: xxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Dead Node: %d\n", dlm->reco.dead_node);
/* What about DLM_RECO_STATE_FINALIZE? */
@@ -858,19 +821,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
state = "INACTIVE";
/* Recovery Pid: xxxx Master: xxx State: xxxx */
- out += snprintf(db->buf + out, db->len - out,
+ out += snprintf(buf + out, len - out,
"Recovery Pid: %d Master: %d State: %s\n",
- dlm->dlm_reco_thread_task->pid,
+ task_pid_nr(dlm->dlm_reco_thread_task),
dlm->reco.new_master, state);
/* Recovery Map: xx xx */
- out += snprintf(db->buf + out, db->len - out, "Recovery Map: ");
+ out += snprintf(buf + out, len - out, "Recovery Map: ");
out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
- db->buf + out, db->len - out);
- out += snprintf(db->buf + out, db->len - out, "\n");
+ buf + out, len - out);
+ out += snprintf(buf + out, len - out, "\n");
/* Recovery Node State: */
- out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n");
+ out += snprintf(buf + out, len - out, "Recovery Node State:\n");
list_for_each_entry(node, &dlm->reco.node_data, list) {
switch (node->state) {
case DLM_RECO_NODE_DATA_INIT:
@@ -898,7 +861,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
state = "BAD";
break;
}
- out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n",
+ out += snprintf(buf + out, len - out, "\t%u - %s\n",
node->node_num, state);
}
@@ -910,15 +873,15 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
static int debug_state_open(struct inode *inode, struct file *file)
{
struct dlm_ctxt *dlm = inode->i_private;
- struct debug_buffer *db = NULL;
+ char *buf = NULL;
- db = debug_buffer_allocate();
- if (!db)
+ buf = (char *) get_zeroed_page(GFP_NOFS);
+ if (!buf)
goto bail;
- db->len = debug_state_print(dlm, db);
+ i_size_write(inode, debug_state_print(dlm, buf, PAGE_SIZE - 1));
- file->private_data = db;
+ file->private_data = buf;
return 0;
bail:
@@ -927,9 +890,9 @@ bail:
static const struct file_operations debug_state_fops = {
.open = debug_state_open,
- .release = debug_buffer_release,
- .read = debug_buffer_read,
- .llseek = debug_buffer_llseek,
+ .release = debug_release,
+ .read = debug_read,
+ .llseek = generic_file_llseek,
};
/* end - debug state funcs */
@@ -993,14 +956,10 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
if (dc) {
- if (dc->debug_purgelist_dentry)
- debugfs_remove(dc->debug_purgelist_dentry);
- if (dc->debug_mle_dentry)
- debugfs_remove(dc->debug_mle_dentry);
- if (dc->debug_lockres_dentry)
- debugfs_remove(dc->debug_lockres_dentry);
- if (dc->debug_state_dentry)
- debugfs_remove(dc->debug_state_dentry);
+ debugfs_remove(dc->debug_purgelist_dentry);
+ debugfs_remove(dc->debug_mle_dentry);
+ debugfs_remove(dc->debug_lockres_dentry);
+ debugfs_remove(dc->debug_state_dentry);
dlm_debug_put(dc);
}
}
@@ -1031,8 +990,7 @@ bail:
void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
- if (dlm->dlm_debugfs_subroot)
- debugfs_remove(dlm->dlm_debugfs_subroot);
+ debugfs_remove(dlm->dlm_debugfs_subroot);
}
/* debugfs root */
@@ -1048,7 +1006,6 @@ int dlm_create_debugfs_root(void)
void dlm_destroy_debugfs_root(void)
{
- if (dlm_debugfs_root)
- debugfs_remove(dlm_debugfs_root);
+ debugfs_remove(dlm_debugfs_root);
}
#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 8c686d22f9c..1f27c4812d1 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -37,11 +37,6 @@ struct dlm_debug_ctxt {
struct dentry *debug_purgelist_dentry;
};
-struct debug_buffer {
- int len;
- char *buf;
-};
-
struct debug_lockres {
int dl_len;
char *dl_buf;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 988c9055fd4..39efc5057a3 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -43,8 +43,6 @@
#include "dlmdomain.h"
#include "dlmdebug.h"
-#include "dlmver.h"
-
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
#include "cluster/masklog.h"
@@ -128,10 +126,16 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
* will have a negotiated version with the same major number and a minor
* number equal or smaller. The dlm_ctxt->dlm_locking_proto field should
* be used to determine what a running domain is actually using.
+ *
+ * New in version 1.1:
+ * - Message DLM_QUERY_REGION added to support global heartbeat
+ * - Message DLM_QUERY_NODEINFO added to allow online node removes
+ * New in version 1.2:
+ * - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
*/
static const struct dlm_protocol_version dlm_protocol = {
.pv_major = 1,
- .pv_minor = 0,
+ .pv_minor = 2,
};
#define DLM_DOMAIN_BACKOFF_MS 200
@@ -142,6 +146,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+ void *data, void **ret_data);
static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
static int dlm_protocol_compare(struct dlm_protocol_version *existing,
@@ -149,16 +155,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing,
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
-void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- if (!hlist_unhashed(&lockres->hash_node)) {
- hlist_del_init(&lockres->hash_node);
- dlm_lockres_put(lockres);
- }
+ if (hlist_unhashed(&res->hash_node))
+ return;
+
+ mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
+ hlist_del_init(&res->hash_node);
+ dlm_lockres_put(res);
}
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res)
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
struct hlist_head *bucket;
struct qstr *q;
@@ -172,6 +180,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
dlm_lockres_get(res);
hlist_add_head(&res->hash_node, bucket);
+
+ mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
}
struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
@@ -180,17 +191,15 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
unsigned int hash)
{
struct hlist_head *bucket;
- struct hlist_node *list;
+ struct dlm_lock_resource *res;
- mlog_entry("%.*s\n", len, name);
+ mlog(0, "%.*s\n", len, name);
assert_spin_locked(&dlm->spinlock);
bucket = dlm_lockres_hash(dlm, hash);
- hlist_for_each(list, bucket) {
- struct dlm_lock_resource *res = hlist_entry(list,
- struct dlm_lock_resource, hash_node);
+ hlist_for_each_entry(res, bucket, hash_node) {
if (res->lockname.name[0] != name[0])
continue;
if (unlikely(res->lockname.len != len))
@@ -216,7 +225,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
{
struct dlm_lock_resource *res = NULL;
- mlog_entry("%.*s\n", len, name);
+ mlog(0, "%.*s\n", len, name);
assert_spin_locked(&dlm->spinlock);
@@ -249,22 +258,19 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
{
- struct dlm_ctxt *tmp = NULL;
- struct list_head *iter;
+ struct dlm_ctxt *tmp;
assert_spin_locked(&dlm_domain_lock);
/* tmp->name here is always NULL terminated,
* but domain may not be! */
- list_for_each(iter, &dlm_domains) {
- tmp = list_entry (iter, struct dlm_ctxt, list);
+ list_for_each_entry(tmp, &dlm_domains, list) {
if (strlen(tmp->name) == len &&
memcmp(tmp->name, domain, len)==0)
- break;
- tmp = NULL;
+ return tmp;
}
- return tmp;
+ return NULL;
}
/* For null terminated domain strings ONLY */
@@ -306,9 +312,7 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
if (dlm->master_hash)
dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
- if (dlm->name)
- kfree(dlm->name);
-
+ kfree(dlm->name);
kfree(dlm);
}
@@ -355,25 +359,22 @@ static void __dlm_get(struct dlm_ctxt *dlm)
* you shouldn't trust your pointer. */
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
{
- struct list_head *iter;
- struct dlm_ctxt *target = NULL;
+ struct dlm_ctxt *target;
+ struct dlm_ctxt *ret = NULL;
spin_lock(&dlm_domain_lock);
- list_for_each(iter, &dlm_domains) {
- target = list_entry (iter, struct dlm_ctxt, list);
-
+ list_for_each_entry(target, &dlm_domains, list) {
if (target == dlm) {
__dlm_get(target);
+ ret = target;
break;
}
-
- target = NULL;
}
spin_unlock(&dlm_domain_lock);
- return target;
+ return ret;
}
int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
@@ -443,19 +444,21 @@ redo_bucket:
dropped = dlm_empty_lockres(dlm, res);
spin_lock(&res->spinlock);
- __dlm_lockres_calc_usage(dlm, res);
- iter = res->hash_node.next;
+ if (dropped)
+ __dlm_lockres_calc_usage(dlm, res);
+ else
+ iter = res->hash_node.next;
spin_unlock(&res->spinlock);
dlm_lockres_put(res);
- if (dropped)
+ if (dropped) {
+ cond_resched_lock(&dlm->spinlock);
goto redo_bucket;
+ }
}
cond_resched_lock(&dlm->spinlock);
num += n;
- mlog(0, "%s: touched %d lockreses in bucket %d "
- "(tot=%d)\n", dlm->name, n, i, num);
}
spin_unlock(&dlm->spinlock);
wake_up(&dlm->dlm_thread_wq);
@@ -482,6 +485,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
return ret;
}
+static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
+ void *data, void **ret_data)
+{
+ struct dlm_ctxt *dlm = data;
+ unsigned int node;
+ struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
+
+ if (!dlm_grab(dlm))
+ return 0;
+
+ node = exit_msg->node_idx;
+ mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);
+
+ spin_lock(&dlm->spinlock);
+ set_bit(node, dlm->exit_domain_map);
+ spin_unlock(&dlm->spinlock);
+
+ dlm_put(dlm);
+
+ return 0;
+}
+
static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
{
/* Yikes, a double spinlock! I need domain_lock for the dlm
@@ -507,17 +532,17 @@ again:
static void __dlm_print_nodes(struct dlm_ctxt *dlm)
{
- int node = -1;
+ int node = -1, num = 0;
assert_spin_locked(&dlm->spinlock);
- printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name);
-
+ printk("( ");
while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
node + 1)) < O2NM_MAX_NODES) {
printk("%d ", node);
+ ++num;
}
- printk("\n");
+ printk(") %u nodes\n", num);
}
static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -527,17 +552,17 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
unsigned int node;
struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
- mlog_entry("%p %u %p", msg, len, data);
+ mlog(0, "%p %u %p", msg, len, data);
if (!dlm_grab(dlm))
return 0;
node = exit_msg->node_idx;
- printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name);
-
spin_lock(&dlm->spinlock);
clear_bit(node, dlm->domain_map);
+ clear_bit(node, dlm->exit_domain_map);
+ printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name);
__dlm_print_nodes(dlm);
/* notify anything attached to the heartbeat events */
@@ -550,27 +575,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
return 0;
}
-static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
+static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
unsigned int node)
{
int status;
struct dlm_exit_domain leave_msg;
- mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
- node, dlm->name, dlm->node_num);
+ mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
+ msg_type, node);
memset(&leave_msg, 0, sizeof(leave_msg));
leave_msg.node_idx = dlm->node_num;
- status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
- &leave_msg, sizeof(leave_msg), node,
- NULL);
-
- mlog(0, "status return %d from o2net_send_message\n", status);
+ status = o2net_send_message(msg_type, dlm->key, &leave_msg,
+ sizeof(leave_msg), node, NULL);
+ if (status < 0)
+ mlog(ML_ERROR, "Error %d sending domain exit message %u "
+ "to node %u on domain %s\n", status, msg_type, node,
+ dlm->name);
return status;
}
+static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
+{
+ int node = -1;
+
+ /* Support for begin exit domain was added in 1.2 */
+ if (dlm->dlm_locking_proto.pv_major == 1 &&
+ dlm->dlm_locking_proto.pv_minor < 2)
+ return;
+
+ /*
+ * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
+ * informational. Meaning if a node does not receive the message,
+ * so be it.
+ */
+ spin_lock(&dlm->spinlock);
+ while (1) {
+ node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1);
+ if (node >= O2NM_MAX_NODES)
+ break;
+ if (node == dlm->node_num)
+ continue;
+
+ spin_unlock(&dlm->spinlock);
+ dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
+ spin_lock(&dlm->spinlock);
+ }
+ spin_unlock(&dlm->spinlock);
+}
static void dlm_leave_domain(struct dlm_ctxt *dlm)
{
@@ -596,7 +650,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
clear_node = 1;
- status = dlm_send_one_domain_exit(dlm, node);
+ status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
+ node);
if (status < 0 &&
status != -ENOPROTOOPT &&
status != -ENOTCONN) {
@@ -671,6 +726,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
if (leave) {
mlog(0, "shutting down domain %s\n", dlm->name);
+ dlm_begin_exit_domain(dlm);
/* We changed dlm state, notify the thread */
dlm_kick_thread(dlm, NULL);
@@ -691,6 +747,8 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
dlm_mark_domain_leaving(dlm);
dlm_leave_domain(dlm);
+ printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name);
+ dlm_force_free_mles(dlm);
dlm_complete_dlm_shutdown(dlm);
}
dlm_put(dlm);
@@ -748,7 +806,7 @@ static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet,
union dlm_query_join_response response;
response.packet = *packet;
- *wire = cpu_to_be32(response.intval);
+ *wire = be32_to_cpu(response.intval);
}
static void dlm_query_join_wire_to_packet(u32 wire,
@@ -901,10 +959,19 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
* domain. Set him in the map and clean up our
* leftover join state. */
BUG_ON(dlm->joining_node != assert->node_idx);
+
+ if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+ mlog(0, "dlm recovery is ongoing, disallow join\n");
+ spin_unlock(&dlm->spinlock);
+ spin_unlock(&dlm_domain_lock);
+ return -EAGAIN;
+ }
+
set_bit(assert->node_idx, dlm->domain_map);
+ clear_bit(assert->node_idx, dlm->exit_domain_map);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n",
+ printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ",
assert->node_idx, dlm->name);
__dlm_print_nodes(dlm);
@@ -918,6 +985,371 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
return 0;
}
+static int dlm_match_regions(struct dlm_ctxt *dlm,
+ struct dlm_query_region *qr,
+ char *local, int locallen)
+{
+ char *remote = qr->qr_regions;
+ char *l, *r;
+ int localnr, i, j, foundit;
+ int status = 0;
+
+ if (!o2hb_global_heartbeat_active()) {
+ if (qr->qr_numregions) {
+ mlog(ML_ERROR, "Domain %s: Joining node %d has global "
+ "heartbeat enabled but local node %d does not\n",
+ qr->qr_domain, qr->qr_node, dlm->node_num);
+ status = -EINVAL;
+ }
+ goto bail;
+ }
+
+ if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
+ mlog(ML_ERROR, "Domain %s: Local node %d has global "
+ "heartbeat enabled but joining node %d does not\n",
+ qr->qr_domain, dlm->node_num, qr->qr_node);
+ status = -EINVAL;
+ goto bail;
+ }
+
+ r = remote;
+ for (i = 0; i < qr->qr_numregions; ++i) {
+ mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
+ r += O2HB_MAX_REGION_NAME_LEN;
+ }
+
+ localnr = min(O2NM_MAX_REGIONS, locallen/O2HB_MAX_REGION_NAME_LEN);
+ localnr = o2hb_get_all_regions(local, (u8)localnr);
+
+ /* compare local regions with remote */
+ l = local;
+ for (i = 0; i < localnr; ++i) {
+ foundit = 0;
+ r = remote;
+ for (j = 0; j <= qr->qr_numregions; ++j) {
+ if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
+ foundit = 1;
+ break;
+ }
+ r += O2HB_MAX_REGION_NAME_LEN;
+ }
+ if (!foundit) {
+ status = -EINVAL;
+ mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+ "in local node %d but not in joining node %d\n",
+ qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
+ dlm->node_num, qr->qr_node);
+ goto bail;
+ }
+ l += O2HB_MAX_REGION_NAME_LEN;
+ }
+
+ /* compare remote with local regions */
+ r = remote;
+ for (i = 0; i < qr->qr_numregions; ++i) {
+ foundit = 0;
+ l = local;
+ for (j = 0; j < localnr; ++j) {
+ if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
+ foundit = 1;
+ break;
+ }
+ l += O2HB_MAX_REGION_NAME_LEN;
+ }
+ if (!foundit) {
+ status = -EINVAL;
+ mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+ "in joining node %d but not in local node %d\n",
+ qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
+ qr->qr_node, dlm->node_num);
+ goto bail;
+ }
+ r += O2HB_MAX_REGION_NAME_LEN;
+ }
+
+bail:
+ return status;
+}
+
+static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+ struct dlm_query_region *qr = NULL;
+ int status, ret = 0, i;
+ char *p;
+
+ if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+ goto bail;
+
+ qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
+ if (!qr) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ qr->qr_node = dlm->node_num;
+ qr->qr_namelen = strlen(dlm->name);
+ memcpy(qr->qr_domain, dlm->name, qr->qr_namelen);
+ /* if local hb, the numregions will be zero */
+ if (o2hb_global_heartbeat_active())
+ qr->qr_numregions = o2hb_get_all_regions(qr->qr_regions,
+ O2NM_MAX_REGIONS);
+
+ p = qr->qr_regions;
+ for (i = 0; i < qr->qr_numregions; ++i, p += O2HB_MAX_REGION_NAME_LEN)
+ mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, p);
+
+ i = -1;
+ while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
+ i + 1)) < O2NM_MAX_NODES) {
+ if (i == dlm->node_num)
+ continue;
+
+ mlog(0, "Sending regions to node %d\n", i);
+
+ ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, qr,
+ sizeof(struct dlm_query_region),
+ i, &status);
+ if (ret >= 0)
+ ret = status;
+ if (ret) {
+ mlog(ML_ERROR, "Region mismatch %d, node %d\n",
+ ret, i);
+ break;
+ }
+ }
+
+bail:
+ kfree(qr);
+ return ret;
+}
+
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+ void *data, void **ret_data)
+{
+ struct dlm_query_region *qr;
+ struct dlm_ctxt *dlm = NULL;
+ char *local = NULL;
+ int status = 0;
+
+ qr = (struct dlm_query_region *) msg->buf;
+
+ mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
+ qr->qr_domain);
+
+ /* buffer used in dlm_mast_regions() */
+ local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
+ if (!local)
+ return -ENOMEM;
+
+ status = -EINVAL;
+
+ spin_lock(&dlm_domain_lock);
+ dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
+ if (!dlm) {
+ mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+ "before join domain\n", qr->qr_node, qr->qr_domain);
+ goto out_domain_lock;
+ }
+
+ spin_lock(&dlm->spinlock);
+ if (dlm->joining_node != qr->qr_node) {
+ mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+ "but joining node is %d\n", qr->qr_node, qr->qr_domain,
+ dlm->joining_node);
+ goto out_dlm_lock;
+ }
+
+ /* Support for global heartbeat was added in 1.1 */
+ if (dlm->dlm_locking_proto.pv_major == 1 &&
+ dlm->dlm_locking_proto.pv_minor == 0) {
+ mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+ "but active dlm protocol is %d.%d\n", qr->qr_node,
+ qr->qr_domain, dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor);
+ goto out_dlm_lock;
+ }
+
+ status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions));
+
+out_dlm_lock:
+ spin_unlock(&dlm->spinlock);
+
+out_domain_lock:
+ spin_unlock(&dlm_domain_lock);
+
+ kfree(local);
+
+ return status;
+}
+
+static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
+{
+ struct o2nm_node *local;
+ struct dlm_node_info *remote;
+ int i, j;
+ int status = 0;
+
+ for (j = 0; j < qn->qn_numnodes; ++j)
+ mlog(0, "Node %3d, %pI4:%u\n", qn->qn_nodes[j].ni_nodenum,
+ &(qn->qn_nodes[j].ni_ipv4_address),
+ ntohs(qn->qn_nodes[j].ni_ipv4_port));
+
+ for (i = 0; i < O2NM_MAX_NODES && !status; ++i) {
+ local = o2nm_get_node_by_num(i);
+ remote = NULL;
+ for (j = 0; j < qn->qn_numnodes; ++j) {
+ if (qn->qn_nodes[j].ni_nodenum == i) {
+ remote = &(qn->qn_nodes[j]);
+ break;
+ }
+ }
+
+ if (!local && !remote)
+ continue;
+
+ if ((local && !remote) || (!local && remote))
+ status = -EINVAL;
+
+ if (!status &&
+ ((remote->ni_nodenum != local->nd_num) ||
+ (remote->ni_ipv4_port != local->nd_ipv4_port) ||
+ (remote->ni_ipv4_address != local->nd_ipv4_address)))
+ status = -EINVAL;
+
+ if (status) {
+ if (remote && !local)
+ mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+ "registered in joining node %d but not in "
+ "local node %d\n", qn->qn_domain,
+ remote->ni_nodenum,
+ &(remote->ni_ipv4_address),
+ ntohs(remote->ni_ipv4_port),
+ qn->qn_nodenum, dlm->node_num);
+ if (local && !remote)
+ mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+ "registered in local node %d but not in "
+ "joining node %d\n", qn->qn_domain,
+ local->nd_num, &(local->nd_ipv4_address),
+ ntohs(local->nd_ipv4_port),
+ dlm->node_num, qn->qn_nodenum);
+ BUG_ON((!local && !remote));
+ }
+
+ if (local)
+ o2nm_node_put(local);
+ }
+
+ return status;
+}
+
+static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+ struct dlm_query_nodeinfo *qn = NULL;
+ struct o2nm_node *node;
+ int ret = 0, status, count, i;
+
+ if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+ goto bail;
+
+ qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
+ if (!qn) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ for (i = 0, count = 0; i < O2NM_MAX_NODES; ++i) {
+ node = o2nm_get_node_by_num(i);
+ if (!node)
+ continue;
+ qn->qn_nodes[count].ni_nodenum = node->nd_num;
+ qn->qn_nodes[count].ni_ipv4_port = node->nd_ipv4_port;
+ qn->qn_nodes[count].ni_ipv4_address = node->nd_ipv4_address;
+ mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
+ &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
+ ++count;
+ o2nm_node_put(node);
+ }
+
+ qn->qn_nodenum = dlm->node_num;
+ qn->qn_numnodes = count;
+ qn->qn_namelen = strlen(dlm->name);
+ memcpy(qn->qn_domain, dlm->name, qn->qn_namelen);
+
+ i = -1;
+ while ((i = find_next_bit(node_map, O2NM_MAX_NODES,
+ i + 1)) < O2NM_MAX_NODES) {
+ if (i == dlm->node_num)
+ continue;
+
+ mlog(0, "Sending nodeinfo to node %d\n", i);
+
+ ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+ qn, sizeof(struct dlm_query_nodeinfo),
+ i, &status);
+ if (ret >= 0)
+ ret = status;
+ if (ret) {
+ mlog(ML_ERROR, "node mismatch %d, node %d\n", ret, i);
+ break;
+ }
+ }
+
+bail:
+ kfree(qn);
+ return ret;
+}
+
+static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
+ void *data, void **ret_data)
+{
+ struct dlm_query_nodeinfo *qn;
+ struct dlm_ctxt *dlm = NULL;
+ int locked = 0, status = -EINVAL;
+
+ qn = (struct dlm_query_nodeinfo *) msg->buf;
+
+ mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
+ qn->qn_domain);
+
+ spin_lock(&dlm_domain_lock);
+ dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
+ if (!dlm) {
+ mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
+ "join domain\n", qn->qn_nodenum, qn->qn_domain);
+ goto bail;
+ }
+
+ spin_lock(&dlm->spinlock);
+ locked = 1;
+ if (dlm->joining_node != qn->qn_nodenum) {
+ mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
+ "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
+ dlm->joining_node);
+ goto bail;
+ }
+
+ /* Support for node query was added in 1.1 */
+ if (dlm->dlm_locking_proto.pv_major == 1 &&
+ dlm->dlm_locking_proto.pv_minor == 0) {
+ mlog(ML_ERROR, "Node %d queried nodes on domain %s "
+ "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
+ qn->qn_domain, dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor);
+ goto bail;
+ }
+
+ status = dlm_match_nodes(dlm, qn);
+
+bail:
+ if (locked)
+ spin_unlock(&dlm->spinlock);
+ spin_unlock(&dlm_domain_lock);
+
+ return status;
+}
+
static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
@@ -962,7 +1394,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
&cancel_msg, sizeof(cancel_msg), node,
NULL);
if (status < 0) {
- mlog_errno(status);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
+ node);
goto bail;
}
@@ -1029,10 +1463,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
- sizeof(join_msg), node,
- &join_resp);
+ sizeof(join_msg), node, &join_resp);
if (status < 0 && status != -ENOPROTOOPT) {
- mlog_errno(status);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
+ node);
goto bail;
}
dlm_query_join_wire_to_packet(join_resp, &packet);
@@ -1090,6 +1525,7 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
unsigned int node)
{
int status;
+ int ret;
struct dlm_assert_joined assert_msg;
mlog(0, "Sending join assert to node %u\n", node);
@@ -1101,9 +1537,13 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
&assert_msg, sizeof(assert_msg), node,
- NULL);
+ &ret);
if (status < 0)
- mlog_errno(status);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
+ node);
+ else
+ status = ret;
return status;
}
@@ -1177,7 +1617,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
struct domain_join_ctxt *ctxt;
enum dlm_query_join_response_code response = JOIN_DISALLOW;
- mlog_entry("%p", dlm);
+ mlog(0, "%p", dlm);
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
if (!ctxt) {
@@ -1233,6 +1673,21 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
set_bit(dlm->node_num, dlm->domain_map);
spin_unlock(&dlm->spinlock);
+ /* Support for global heartbeat and node info was added in 1.1 */
+ if (dlm->dlm_locking_proto.pv_major > 1 ||
+ dlm->dlm_locking_proto.pv_minor > 0) {
+ status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ status = dlm_send_regions(dlm, ctxt->yes_resp_map);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
/* Joined state *must* be set before the joining node
@@ -1247,8 +1702,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
bail:
spin_lock(&dlm->spinlock);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
- if (!status)
+ if (!status) {
+ printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name);
__dlm_print_nodes(dlm);
+ }
spin_unlock(&dlm->spinlock);
if (ctxt) {
@@ -1269,8 +1726,8 @@ bail:
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
{
- o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
- o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
+ o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_up);
+ o2hb_unregister_callback(dlm->name, &dlm->dlm_hb_down);
o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
}
@@ -1282,13 +1739,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
- status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
+ status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
if (status)
goto bail;
o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
- status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
+ status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
if (status)
goto bail;
@@ -1398,6 +1855,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
if (status)
goto bail;
+ status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
+ sizeof(struct dlm_exit_domain),
+ dlm_begin_exit_domain_handler,
+ dlm, NULL, &dlm->dlm_domain_handlers);
+ if (status)
+ goto bail;
+
bail:
if (status)
dlm_unregister_domain_handlers(dlm);
@@ -1421,19 +1885,19 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
goto bail;
}
- status = dlm_debug_init(dlm);
+ status = dlm_launch_thread(dlm);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = dlm_launch_thread(dlm);
+ status = dlm_launch_recovery_thread(dlm);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = dlm_launch_recovery_thread(dlm);
+ status = dlm_debug_init(dlm);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1516,7 +1980,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
goto leave;
}
- dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
+ dlm->name = kstrdup(domain, GFP_KERNEL);
if (dlm->name == NULL) {
mlog_errno(-ENOMEM);
kfree(dlm);
@@ -1550,7 +2014,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
for (i = 0; i < DLM_HASH_BUCKETS; i++)
INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
- strcpy(dlm->name, domain);
dlm->key = key;
dlm->node_num = o2nm_this_node();
@@ -1571,7 +2034,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
INIT_LIST_HEAD(&dlm->list);
INIT_LIST_HEAD(&dlm->dirty_list);
INIT_LIST_HEAD(&dlm->reco.resources);
- INIT_LIST_HEAD(&dlm->reco.received);
INIT_LIST_HEAD(&dlm->reco.node_data);
INIT_LIST_HEAD(&dlm->purge_list);
INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
@@ -1665,19 +2127,12 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
struct dlm_ctxt *dlm = NULL;
struct dlm_ctxt *new_ctxt = NULL;
- if (strlen(domain) > O2NM_MAX_NAME_LEN) {
+ if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
ret = -ENAMETOOLONG;
mlog(ML_ERROR, "domain name length too long\n");
goto leave;
}
- if (!o2hb_check_local_node_heartbeating()) {
- mlog(ML_ERROR, "the local node has not been configured, or is "
- "not heartbeating\n");
- ret = -EPROTO;
- goto leave;
- }
-
mlog(0, "register called for domain \"%s\"\n", domain);
retry:
@@ -1703,6 +2158,7 @@ retry:
}
if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
+ spin_unlock(&dlm_domain_lock);
mlog(ML_ERROR,
"Requested locking protocol version is not "
"compatible with already registered domain "
@@ -1799,7 +2255,21 @@ static int dlm_register_net_handlers(void)
sizeof(struct dlm_cancel_join),
dlm_cancel_join_handler,
NULL, NULL, &dlm_join_handlers);
+ if (status)
+ goto bail;
+ status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
+ sizeof(struct dlm_query_region),
+ dlm_query_region_handler,
+ NULL, NULL, &dlm_join_handlers);
+
+ if (status)
+ goto bail;
+
+ status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+ sizeof(struct dlm_query_nodeinfo),
+ dlm_query_nodeinfo_handler,
+ NULL, NULL, &dlm_join_handlers);
bail:
if (status < 0)
dlm_unregister_net_handlers();
@@ -1823,13 +2293,10 @@ static DECLARE_RWSEM(dlm_callback_sem);
void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
int node_num)
{
- struct list_head *iter;
struct dlm_eviction_cb *cb;
down_read(&dlm_callback_sem);
- list_for_each(iter, &dlm->dlm_eviction_callbacks) {
- cb = list_entry(iter, struct dlm_eviction_cb, ec_item);
-
+ list_for_each_entry(cb, &dlm->dlm_eviction_callbacks, ec_item) {
cb->ec_func(node_num, cb->ec_data);
}
up_read(&dlm_callback_sem);
@@ -1866,8 +2333,6 @@ static int __init dlm_init(void)
{
int status;
- dlm_print_version();
-
status = dlm_init_mle_cache();
if (status) {
mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
@@ -1917,6 +2382,7 @@ static void __exit dlm_exit (void)
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 Distributed Lock Management");
module_init(dlm_init);
module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 73333777267..66c2a491f68 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -52,7 +52,7 @@
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
-static struct kmem_cache *dlm_lock_cache = NULL;
+static struct kmem_cache *dlm_lock_cache;
static DEFINE_SPINLOCK(dlm_cookie_lock);
static u64 dlm_next_cookie = 1;
@@ -91,21 +91,19 @@ void dlm_destroy_lock_cache(void)
static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
- struct list_head *iter;
struct dlm_lock *tmplock;
- list_for_each(iter, &res->granted) {
- tmplock = list_entry(iter, struct dlm_lock, list);
-
+ list_for_each_entry(tmplock, &res->granted, list) {
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
return 0;
}
- list_for_each(iter, &res->converting) {
- tmplock = list_entry(iter, struct dlm_lock, list);
-
+ list_for_each_entry(tmplock, &res->converting, list) {
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
return 0;
+ if (!dlm_lock_compatible(tmplock->ml.convert_type,
+ lock->ml.type))
+ return 0;
}
return 1;
@@ -125,7 +123,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
int call_ast = 0, kick_thread = 0;
enum dlm_status status = DLM_NORMAL;
- mlog_entry("type=%d\n", lock->ml.type);
+ mlog(0, "type=%d\n", lock->ml.type);
spin_lock(&res->spinlock);
/* if called from dlm_create_lock_handler, need to
@@ -175,15 +173,12 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
lock->ml.node);
}
} else {
+ status = DLM_NORMAL;
dlm_lock_get(lock);
list_add_tail(&lock->list, &res->blocked);
kick_thread = 1;
}
}
- /* reduce the inflight count, this may result in the lockres
- * being purged below during calc_usage */
- if (lock->ml.node == dlm->node_num)
- dlm_lockres_drop_inflight_ref(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
@@ -224,14 +219,20 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
enum dlm_status status = DLM_DENIED;
int lockres_changed = 1;
- mlog_entry("type=%d\n", lock->ml.type);
- mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
+ mlog(0, "type=%d, lockres %.*s, flags = 0x%x\n",
+ lock->ml.type, res->lockname.len,
res->lockname.name, flags);
+ /*
+ * Wait if resource is getting recovered, remastered, etc.
+ * If the resource was remastered and new owner is self, then exit.
+ */
spin_lock(&res->spinlock);
-
- /* will exit this call with spinlock held */
__dlm_wait_on_lockres(res);
+ if (res->owner == dlm->node_num) {
+ spin_unlock(&res->spinlock);
+ return DLM_RECOVERING;
+ }
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* add lock to local (secondary) queue */
@@ -305,8 +306,6 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
int tmpret, status = 0;
enum dlm_status ret;
- mlog_entry_void();
-
memset(&create, 0, sizeof(create));
create.node_idx = dlm->node_num;
create.requested_type = lock->ml.type;
@@ -318,25 +317,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
sizeof(create), res->owner, &status);
if (tmpret >= 0) {
- // successfully sent and received
- ret = status; // this is already a dlm_status
+ ret = status;
if (ret == DLM_REJECTED) {
- mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres "
- "no longer owned by %u. that node is coming back "
- "up currently.\n", dlm->name, create.namelen,
+ mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer "
+ "owned by node %u. That node is coming back up "
+ "currently.\n", dlm->name, create.namelen,
create.name, res->owner);
dlm_print_one_lock_resource(res);
BUG();
}
} else {
- mlog_errno(tmpret);
- if (dlm_is_host_down(tmpret)) {
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to "
+ "node %u\n", dlm->name, create.namelen, create.name,
+ tmpret, res->owner);
+ if (dlm_is_host_down(tmpret))
ret = DLM_RECOVERING;
- mlog(0, "node %u died so returning DLM_RECOVERING "
- "from lock message!\n", res->owner);
- } else {
+ else
ret = dlm_err_to_dlm_status(tmpret);
- }
}
return ret;
@@ -429,7 +426,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
struct dlm_lock *lock;
int kernel_allocated = 0;
- lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
+ lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
if (!lock)
return NULL;
@@ -437,7 +434,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
/* zero memory only if kernel-allocated */
lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
if (!lksb) {
- kfree(lock);
+ kmem_cache_free(dlm_lock_cache, lock);
return NULL;
}
kernel_allocated = 1;
@@ -472,8 +469,6 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
BUG_ON(!dlm);
- mlog_entry_void();
-
if (!dlm_grab(dlm))
return DLM_REJECTED;
@@ -717,18 +712,10 @@ retry_lock:
if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
status == DLM_FORWARD) {
- mlog(0, "retrying lock with migration/"
- "recovery/in progress\n");
msleep(100);
- /* no waiting for dlm_reco_thread */
if (recovery) {
if (status != DLM_RECOVERING)
goto retry_lock;
-
- mlog(0, "%s: got RECOVERING "
- "for $RECOVERY lock, master "
- "was %u\n", dlm->name,
- res->owner);
/* wait to see the node go down, then
* drop down and allow the lockres to
* get cleaned up. need to remaster. */
@@ -740,6 +727,14 @@ retry_lock:
}
}
+ /* Inflight taken in dlm_get_lock_resource() is dropped here */
+ spin_lock(&res->spinlock);
+ dlm_lockres_drop_inflight_ref(dlm, res);
+ spin_unlock(&res->spinlock);
+
+ dlm_lockres_calc_usage(dlm, res);
+ dlm_kick_thread(dlm, res);
+
if (status != DLM_NORMAL) {
lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
if (status != DLM_NOTQUEUED)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a659606dcb9..82abf0cc9a1 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -82,9 +82,9 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
return 1;
}
-static struct kmem_cache *dlm_lockres_cache = NULL;
-static struct kmem_cache *dlm_lockname_cache = NULL;
-static struct kmem_cache *dlm_mle_cache = NULL;
+static struct kmem_cache *dlm_lockres_cache;
+static struct kmem_cache *dlm_lockname_cache;
+static struct kmem_cache *dlm_mle_cache;
static void dlm_mle_release(struct kref *kref);
static void dlm_init_mle(struct dlm_master_list_entry *mle,
@@ -342,16 +342,13 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
{
struct dlm_master_list_entry *tmpmle;
struct hlist_head *bucket;
- struct hlist_node *list;
unsigned int hash;
assert_spin_locked(&dlm->master_lock);
hash = dlm_lockid_hash(name, namelen);
bucket = dlm_master_hash(dlm, hash);
- hlist_for_each(list, bucket) {
- tmpmle = hlist_entry(list, struct dlm_master_list_entry,
- master_hash_node);
+ hlist_for_each_entry(tmpmle, bucket, master_hash_node) {
if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
continue;
dlm_get_mle(tmpmle);
@@ -426,8 +423,6 @@ static void dlm_mle_release(struct kref *kref)
struct dlm_master_list_entry *mle;
struct dlm_ctxt *dlm;
- mlog_entry_void();
-
mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
dlm = mle->dlm;
@@ -477,11 +472,15 @@ bail:
void dlm_destroy_master_caches(void)
{
- if (dlm_lockname_cache)
+ if (dlm_lockname_cache) {
kmem_cache_destroy(dlm_lockname_cache);
+ dlm_lockname_cache = NULL;
+ }
- if (dlm_lockres_cache)
+ if (dlm_lockres_cache) {
kmem_cache_destroy(dlm_lockres_cache);
+ dlm_lockres_cache = NULL;
+ }
}
static void dlm_lockres_release(struct kref *kref)
@@ -511,8 +510,6 @@ static void dlm_lockres_release(struct kref *kref)
atomic_dec(&dlm->res_cur_count);
- dlm_put(dlm);
-
if (!hlist_unhashed(&res->hash_node) ||
!list_empty(&res->granted) ||
!list_empty(&res->converting) ||
@@ -584,9 +581,8 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
atomic_set(&res->asts_reserved, 0);
res->migration_pending = 0;
res->inflight_locks = 0;
+ res->inflight_assert_workers = 0;
- /* put in dlm_lockres_release */
- dlm_grab(dlm);
res->dlm = dlm;
kref_init(&res->refs);
@@ -617,13 +613,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
{
struct dlm_lock_resource *res = NULL;
- res = (struct dlm_lock_resource *)
- kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
+ res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
if (!res)
goto error;
- res->lockname.name = (char *)
- kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
+ res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
if (!res->lockname.name)
goto error;
@@ -639,42 +633,94 @@ error:
return NULL;
}
-void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- int new_lockres,
- const char *file,
- int line)
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit)
{
- if (!new_lockres)
- assert_spin_locked(&res->spinlock);
+ assert_spin_locked(&res->spinlock);
+
+ mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
+ res->lockname.name, bit, __builtin_return_address(0));
+
+ set_bit(bit, res->refmap);
+}
+
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, int bit)
+{
+ assert_spin_locked(&res->spinlock);
+
+ mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
+ res->lockname.name, bit, __builtin_return_address(0));
+
+ clear_bit(bit, res->refmap);
+}
+
+
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
- if (!test_bit(dlm->node_num, res->refmap)) {
- BUG_ON(res->inflight_locks != 0);
- dlm_lockres_set_refmap_bit(dlm->node_num, res);
- }
res->inflight_locks++;
- mlog(0, "%s:%.*s: inflight++: now %u\n",
- dlm->name, res->lockname.len, res->lockname.name,
- res->inflight_locks);
+
+ mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
+ res->lockname.len, res->lockname.name, res->inflight_locks,
+ __builtin_return_address(0));
}
-void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- const char *file,
- int line)
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
{
assert_spin_locked(&res->spinlock);
BUG_ON(res->inflight_locks == 0);
+
res->inflight_locks--;
- mlog(0, "%s:%.*s: inflight--: now %u\n",
- dlm->name, res->lockname.len, res->lockname.name,
- res->inflight_locks);
- if (res->inflight_locks == 0)
- dlm_lockres_clear_refmap_bit(dlm->node_num, res);
+
+ mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
+ res->lockname.len, res->lockname.name, res->inflight_locks,
+ __builtin_return_address(0));
+
wake_up(&res->wq);
}
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ res->inflight_assert_workers++;
+ mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->inflight_assert_workers);
+}
+
+static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ spin_lock(&res->spinlock);
+ __dlm_lockres_grab_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
+}
+
+static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ BUG_ON(res->inflight_assert_workers == 0);
+ res->inflight_assert_workers--;
+ mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->inflight_assert_workers);
+}
+
+static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ spin_lock(&res->spinlock);
+ __dlm_lockres_drop_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
+}
+
/*
* lookup a lock resource by name.
* may already exist in the hashtable.
@@ -705,7 +751,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
unsigned int hash;
int tries = 0;
int bit, wait_on_recovery = 0;
- int drop_inflight_if_nonlocal = 0;
BUG_ON(!lockid);
@@ -717,36 +762,33 @@ lookup:
spin_lock(&dlm->spinlock);
tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
if (tmpres) {
- int dropping_ref = 0;
-
spin_unlock(&dlm->spinlock);
-
spin_lock(&tmpres->spinlock);
- /* We wait for the other thread that is mastering the resource */
+ /* Wait on the thread that is mastering the resource */
if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
__dlm_wait_on_lockres(tmpres);
BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+ spin_unlock(&tmpres->spinlock);
+ dlm_lockres_put(tmpres);
+ tmpres = NULL;
+ goto lookup;
}
- if (tmpres->owner == dlm->node_num) {
- BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
- dlm_lockres_grab_inflight_ref(dlm, tmpres);
- } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
- dropping_ref = 1;
- spin_unlock(&tmpres->spinlock);
-
- /* wait until done messaging the master, drop our ref to allow
- * the lockres to be purged, start over. */
- if (dropping_ref) {
- spin_lock(&tmpres->spinlock);
- __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
+ /* Wait on the resource purge to complete before continuing */
+ if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
+ BUG_ON(tmpres->owner == dlm->node_num);
+ __dlm_wait_on_lockres_flags(tmpres,
+ DLM_LOCK_RES_DROPPING_REF);
spin_unlock(&tmpres->spinlock);
dlm_lockres_put(tmpres);
tmpres = NULL;
goto lookup;
}
- mlog(0, "found in hash!\n");
+ /* Grab inflight ref to pin the resource */
+ dlm_lockres_grab_inflight_ref(dlm, tmpres);
+
+ spin_unlock(&tmpres->spinlock);
if (res)
dlm_lockres_put(res);
res = tmpres;
@@ -757,8 +799,7 @@ lookup:
spin_unlock(&dlm->spinlock);
mlog(0, "allocating a new resource\n");
/* nothing found and we need to allocate one. */
- alloc_mle = (struct dlm_master_list_entry *)
- kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+ alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!alloc_mle)
goto leave;
res = dlm_new_lockres(dlm, lockid, namelen);
@@ -817,7 +858,7 @@ lookup:
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
mle = NULL;
- /* this is lame, but we cant wait on either
+ /* this is lame, but we can't wait on either
* the mle or lockres waitqueue here */
if (mig)
msleep(100);
@@ -838,8 +879,8 @@ lookup:
* but they might own this lockres. wait on them. */
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
- "recover before lock mastery can begin\n",
+ mlog(0, "%s: res %.*s, At least one node (%d) "
+ "to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
}
@@ -852,12 +893,11 @@ lookup:
/* finally add the lockres to its hash bucket */
__dlm_insert_lockres(dlm, res);
- /* since this lockres is new it doesnt not require the spinlock */
- dlm_lockres_grab_inflight_ref_new(dlm, res);
- /* if this node does not become the master make sure to drop
- * this inflight reference below */
- drop_inflight_if_nonlocal = 1;
+ /* Grab inflight ref to pin the resource */
+ spin_lock(&res->spinlock);
+ dlm_lockres_grab_inflight_ref(dlm, res);
+ spin_unlock(&res->spinlock);
/* get an extra ref on the mle in case this is a BLOCK
* if so, the creator of the BLOCK may try to put the last
@@ -873,8 +913,8 @@ redo_request:
* dlm spinlock would be detectable be a change on the mle,
* so we only need to clear out the recovery map once. */
if (dlm_is_recovery_lock(lockid, namelen)) {
- mlog(ML_NOTICE, "%s: recovery map is not empty, but "
- "must master $RECOVERY lock now\n", dlm->name);
+ mlog(0, "%s: Recovery map is not empty, but must "
+ "master $RECOVERY lock now\n", dlm->name);
if (!dlm_pre_master_reco_lockres(dlm, res))
wait_on_recovery = 0;
else {
@@ -892,8 +932,8 @@ redo_request:
spin_lock(&dlm->spinlock);
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
- mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
- "recover before lock mastery can begin\n",
+ mlog(0, "%s: res %.*s, At least one node (%d) "
+ "to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
} else
@@ -922,8 +962,8 @@ redo_request:
* yet, keep going until it does. this is how the
* master will know that asserts are needed back to
* the lower nodes. */
- mlog(0, "%s:%.*s: requests only up to %u but master "
- "is %u, keep going\n", dlm->name, namelen,
+ mlog(0, "%s: res %.*s, Requests only up to %u but "
+ "master is %u, keep going\n", dlm->name, namelen,
lockid, nodenum, mle->master);
}
}
@@ -933,13 +973,12 @@ wait:
ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
if (ret < 0) {
wait_on_recovery = 1;
- mlog(0, "%s:%.*s: node map changed, redo the "
- "master request now, blocked=%d\n",
- dlm->name, res->lockname.len,
+ mlog(0, "%s: res %.*s, Node map changed, redo the master "
+ "request now, blocked=%d\n", dlm->name, res->lockname.len,
res->lockname.name, blocked);
if (++tries > 20) {
- mlog(ML_ERROR, "%s:%.*s: spinning on "
- "dlm_wait_for_lock_mastery, blocked=%d\n",
+ mlog(ML_ERROR, "%s: res %.*s, Spinning on "
+ "dlm_wait_for_lock_mastery, blocked = %d\n",
dlm->name, res->lockname.len,
res->lockname.name, blocked);
dlm_print_one_lock_resource(res);
@@ -949,7 +988,8 @@ wait:
goto redo_request;
}
- mlog(0, "lockres mastered by %u\n", res->owner);
+ mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
+ res->lockname.name, res->owner);
/* make sure we never continue without this */
BUG_ON(res->owner == O2NM_MAX_NODES);
@@ -961,8 +1001,6 @@ wait:
wake_waiters:
spin_lock(&res->spinlock);
- if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
- dlm_lockres_drop_inflight_ref(dlm, res);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
@@ -1435,9 +1473,7 @@ way_up_top:
}
if (res->owner == dlm->node_num) {
- mlog(0, "%s:%.*s: setting bit %u in refmap\n",
- dlm->name, namelen, name, request->node_idx);
- dlm_lockres_set_refmap_bit(request->node_idx, res);
+ dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
spin_unlock(&res->spinlock);
response = DLM_MASTER_RESP_YES;
if (mle)
@@ -1502,10 +1538,8 @@ way_up_top:
* go back and clean the mles on any
* other nodes */
dispatch_assert = 1;
- dlm_lockres_set_refmap_bit(request->node_idx, res);
- mlog(0, "%s:%.*s: setting bit %u in refmap\n",
- dlm->name, namelen, name,
- request->node_idx);
+ dlm_lockres_set_refmap_bit(dlm, res,
+ request->node_idx);
} else
response = DLM_MASTER_RESP_NO;
} else {
@@ -1542,8 +1576,7 @@ way_up_top:
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
- mle = (struct dlm_master_list_entry *)
- kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
+ mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
response = DLM_MASTER_RESP_ERROR;
mlog_errno(-ENOMEM);
@@ -1608,7 +1641,8 @@ send_response:
mlog(ML_ERROR, "failed to dispatch assert master work\n");
response = DLM_MASTER_RESP_ERROR;
dlm_lockres_put(res);
- }
+ } else
+ dlm_lockres_grab_inflight_worker(dlm, res);
} else {
if (res)
dlm_lockres_put(res);
@@ -1666,7 +1700,9 @@ again:
tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
&assert, sizeof(assert), to, &r);
if (tmpret < 0) {
- mlog(0, "assert_master returned %d!\n", tmpret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", tmpret,
+ DLM_ASSERT_MASTER_MSG, dlm->key, to);
if (!dlm_is_host_down(tmpret)) {
mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
BUG();
@@ -1710,7 +1746,7 @@ again:
"lockres, set the bit in the refmap\n",
namelen, lockname, to);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(to, res);
+ dlm_lockres_set_refmap_bit(dlm, res, to);
spin_unlock(&res->spinlock);
}
}
@@ -1875,7 +1911,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
ok:
spin_unlock(&res->spinlock);
}
- spin_unlock(&dlm->spinlock);
// mlog(0, "woo! got an assert_master from node %u!\n",
// assert->node_idx);
@@ -1893,8 +1928,10 @@ ok:
* up nodes that this node contacted */
while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
nn+1)) < O2NM_MAX_NODES) {
- if (nn != dlm->node_num && nn != assert->node_idx)
+ if (nn != dlm->node_num && nn != assert->node_idx) {
master_request = 1;
+ break;
+ }
}
}
mle->master = assert->node_idx;
@@ -1926,7 +1963,6 @@ ok:
/* master is known, detach if not already detached.
* ensures that only one assert_master call will happen
* on this mle. */
- spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1995,6 @@ ok:
__dlm_put_mle(mle);
}
spin_unlock(&dlm->master_lock);
- spin_unlock(&dlm->spinlock);
} else if (res) {
if (res->owner != assert->node_idx) {
mlog(0, "assert_master from %u, but current "
@@ -1967,6 +2002,7 @@ ok:
res->owner, namelen, name);
}
}
+ spin_unlock(&dlm->spinlock);
done:
ret = 0;
@@ -2026,7 +2062,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
int ignore_higher, u8 request_from, u32 flags)
{
struct dlm_work_item *item;
- item = kzalloc(sizeof(*item), GFP_NOFS);
+ item = kzalloc(sizeof(*item), GFP_ATOMIC);
if (!item)
return -ENOMEM;
@@ -2121,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
dlm_lockres_release_ast(dlm, res);
put:
+ dlm_lockres_drop_inflight_worker(dlm, res);
+
dlm_lockres_put(res);
mlog(0, "finished with dlm_assert_master_worker\n");
@@ -2197,8 +2235,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
namelen = res->lockname.len;
BUG_ON(namelen > O2NM_MAX_NAME_LEN);
- mlog(0, "%s:%.*s: sending deref to %d\n",
- dlm->name, namelen, lockname, res->owner);
memset(&deref, 0, sizeof(deref));
deref.node_idx = dlm->node_num;
deref.namelen = namelen;
@@ -2207,12 +2243,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
&deref, sizeof(deref), res->owner, &r);
if (ret < 0)
- mlog_errno(ret);
+ mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
+ dlm->name, namelen, lockname, ret, res->owner);
else if (r < 0) {
/* BAD. other node says I did not have a ref. */
- mlog(ML_ERROR,"while dropping ref on %s:%.*s "
- "(master=%u) got %d.\n", dlm->name, namelen,
- lockname, res->owner, r);
+ mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+ dlm->name, namelen, lockname, res->owner, r);
dlm_print_one_lock_resource(res);
BUG();
}
@@ -2268,7 +2304,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
else {
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
if (test_bit(node, res->refmap)) {
- dlm_lockres_clear_refmap_bit(node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
}
@@ -2328,7 +2364,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
if (test_bit(node, res->refmap)) {
__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
- dlm_lockres_clear_refmap_bit(node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
spin_unlock(&res->spinlock);
@@ -2347,55 +2383,59 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
dlm_lockres_put(res);
}
-/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0
- * if not. If 0, numlocks is set to the number of locks in the lockres.
+/*
+ * A migrateable resource is one that is :
+ * 1. locally mastered, and,
+ * 2. zero local locks, and,
+ * 3. one or more non-local locks, or, one or more references
+ * Returns 1 if yes, 0 if not.
*/
static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- int *numlocks)
+ struct dlm_lock_resource *res)
{
- int ret;
- int i;
- int count = 0;
+ enum dlm_lockres_list idx;
+ int nonlocal = 0, node_ref;
struct list_head *queue;
struct dlm_lock *lock;
+ u64 cookie;
assert_spin_locked(&res->spinlock);
- ret = -EINVAL;
- if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
- mlog(0, "cannot migrate lockres with unknown owner!\n");
- goto leave;
- }
+ /* delay migration when the lockres is in MIGRATING state */
+ if (res->state & DLM_LOCK_RES_MIGRATING)
+ return 0;
- if (res->owner != dlm->node_num) {
- mlog(0, "cannot migrate lockres this node doesn't own!\n");
- goto leave;
- }
+ if (res->owner != dlm->node_num)
+ return 0;
- ret = 0;
- queue = &res->granted;
- for (i = 0; i < 3; i++) {
+ for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+ queue = dlm_list_idx_to_ptr(res, idx);
list_for_each_entry(lock, queue, list) {
- ++count;
- if (lock->ml.node == dlm->node_num) {
- mlog(0, "found a lock owned by this node still "
- "on the %s queue! will not migrate this "
- "lockres\n", (i == 0 ? "granted" :
- (i == 1 ? "converting" :
- "blocked")));
- ret = -ENOTEMPTY;
- goto leave;
+ if (lock->ml.node != dlm->node_num) {
+ nonlocal++;
+ continue;
}
+ cookie = be64_to_cpu(lock->ml.cookie);
+ mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
+ "%s list\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(cookie),
+ dlm_get_lock_cookie_seq(cookie),
+ dlm_list_in_text(idx));
+ return 0;
}
- queue++;
}
- *numlocks = count;
- mlog(0, "migrateable lockres having %d locks\n", *numlocks);
+ if (!nonlocal) {
+ node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ if (node_ref >= O2NM_MAX_NODES)
+ return 0;
+ }
-leave:
- return ret;
+ mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
+ res->lockname.name);
+
+ return 1;
}
/*
@@ -2404,8 +2444,7 @@ leave:
static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res,
- u8 target)
+ struct dlm_lock_resource *res, u8 target)
{
struct dlm_master_list_entry *mle = NULL;
struct dlm_master_list_entry *oldmle = NULL;
@@ -2414,39 +2453,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
const char *name;
unsigned int namelen;
int mle_added = 0;
- int numlocks;
int wake = 0;
if (!dlm_grab(dlm))
return -EINVAL;
+ BUG_ON(target == O2NM_MAX_NODES);
+
name = res->lockname.name;
namelen = res->lockname.len;
- mlog(0, "migrating %.*s to %u\n", namelen, name, target);
-
- /*
- * ensure this lockres is a proper candidate for migration
- */
- spin_lock(&res->spinlock);
- ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
- if (ret < 0) {
- spin_unlock(&res->spinlock);
- goto leave;
- }
- spin_unlock(&res->spinlock);
-
- /* no work to do */
- if (numlocks == 0) {
- mlog(0, "no locks were found on this lockres! done!\n");
- goto leave;
- }
-
- /*
- * preallocate up front
- * if this fails, abort
- */
+ mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
+ target);
+ /* preallocate up front. if this fails, abort */
ret = -ENOMEM;
mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
if (!mres) {
@@ -2454,8 +2474,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
goto leave;
}
- mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
- GFP_NOFS);
+ mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
mlog_errno(ret);
goto leave;
@@ -2463,35 +2482,10 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
ret = 0;
/*
- * find a node to migrate the lockres to
- */
-
- mlog(0, "picking a migration node\n");
- spin_lock(&dlm->spinlock);
- /* pick a new node */
- if (!test_bit(target, dlm->domain_map) ||
- target >= O2NM_MAX_NODES) {
- target = dlm_pick_migration_target(dlm, res);
- }
- mlog(0, "node %u chosen for migration\n", target);
-
- if (target >= O2NM_MAX_NODES ||
- !test_bit(target, dlm->domain_map)) {
- /* target chosen is not alive */
- ret = -EINVAL;
- }
-
- if (ret) {
- spin_unlock(&dlm->spinlock);
- goto fail;
- }
-
- mlog(0, "continuing with target = %u\n", target);
-
- /*
* clear any existing master requests and
* add the migration mle to the list
*/
+ spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
@@ -2532,6 +2526,7 @@ fail:
dlm_put_mle(mle);
} else if (mle) {
kmem_cache_free(dlm_mle_cache, mle);
+ mle = NULL;
}
goto leave;
}
@@ -2575,6 +2570,9 @@ fail:
res->state &= ~DLM_LOCK_RES_MIGRATING;
wake = 1;
spin_unlock(&res->spinlock);
+ if (dlm_is_host_down(ret))
+ dlm_wait_for_node_death(dlm, target,
+ DLM_NODE_DEATH_WAIT_MAX);
goto leave;
}
@@ -2650,69 +2648,52 @@ leave:
if (wake)
wake_up(&res->wq);
- /* TODO: cleanup */
if (mres)
free_page((unsigned long)mres);
dlm_put(dlm);
- mlog(0, "returning %d\n", ret);
+ mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
+ name, target, ret);
return ret;
}
#define DLM_MIGRATION_RETRY_MS 100
-/* Should be called only after beginning the domain leave process.
+/*
+ * Should be called only after beginning the domain leave process.
* There should not be any remaining locks on nonlocal lock resources,
* and there should be no local locks left on locally mastered resources.
*
* Called with the dlm spinlock held, may drop it to do migration, but
* will re-acquire before exit.
*
- * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */
+ * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
+ */
int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
int ret;
int lock_dropped = 0;
- int numlocks;
+ u8 target = O2NM_MAX_NODES;
+
+ assert_spin_locked(&dlm->spinlock);
spin_lock(&res->spinlock);
- if (res->owner != dlm->node_num) {
- if (!__dlm_lockres_unused(res)) {
- mlog(ML_ERROR, "%s:%.*s: this node is not master, "
- "trying to free this but locks remain\n",
- dlm->name, res->lockname.len, res->lockname.name);
- }
- spin_unlock(&res->spinlock);
- goto leave;
- }
+ if (dlm_is_lockres_migrateable(dlm, res))
+ target = dlm_pick_migration_target(dlm, res);
+ spin_unlock(&res->spinlock);
- /* No need to migrate a lockres having no locks */
- ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
- if (ret >= 0 && numlocks == 0) {
- spin_unlock(&res->spinlock);
+ if (target == O2NM_MAX_NODES)
goto leave;
- }
- spin_unlock(&res->spinlock);
/* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
spin_unlock(&dlm->spinlock);
lock_dropped = 1;
- while (1) {
- ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES);
- if (ret >= 0)
- break;
- if (ret == -ENOTEMPTY) {
- mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
- res->lockname.len, res->lockname.name);
- BUG();
- }
-
- mlog(0, "lockres %.*s: migrate failed, "
- "retrying\n", res->lockname.len,
- res->lockname.name);
- msleep(DLM_MIGRATION_RETRY_MS);
- }
+ ret = dlm_migrate_lockres(dlm, res, target);
+ if (ret)
+ mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ target, ret);
spin_lock(&dlm->spinlock);
leave:
return lock_dropped;
@@ -2811,14 +2792,8 @@ again:
mlog(0, "trying again...\n");
goto again;
}
- /* now that we are sure the MIGRATING state is there, drop
- * the unneded state which blocked threads trying to DIRTY */
- spin_lock(&res->spinlock);
- BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
- BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
- res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
- spin_unlock(&res->spinlock);
+ ret = 0;
/* did the target go down or die? */
spin_lock(&dlm->spinlock);
if (!test_bit(target, dlm->domain_map)) {
@@ -2829,9 +2804,21 @@ again:
spin_unlock(&dlm->spinlock);
/*
+ * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
+ * another try; otherwise, we are sure the MIGRATING state is there,
+ * drop the unneded state which blocked threads trying to DIRTY
+ */
+ spin_lock(&res->spinlock);
+ BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
+ res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
+ if (!ret)
+ BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+ spin_unlock(&res->spinlock);
+
+ /*
* at this point:
*
- * o the DLM_LOCK_RES_MIGRATING flag is set
+ * o the DLM_LOCK_RES_MIGRATING flag is set if target not down
* o there are no pending asts on this lockres
* o all processes trying to reserve an ast on this
* lockres must wait for the MIGRATING flag to clear
@@ -2863,7 +2850,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
BUG_ON(!list_empty(&lock->bast_list));
BUG_ON(lock->ast_pending);
BUG_ON(lock->bast_pending);
- dlm_lockres_clear_refmap_bit(lock->ml.node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res,
+ lock->ml.node);
list_del_init(&lock->list);
dlm_lock_put(lock);
/* In a normal unlock, we would have added a
@@ -2884,61 +2872,61 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
mlog(0, "%s:%.*s: node %u had a ref to this "
"migrating lockres, clearing\n", dlm->name,
res->lockname.len, res->lockname.name, bit);
- dlm_lockres_clear_refmap_bit(bit, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, bit);
}
bit++;
}
}
-/* for now this is not too intelligent. we will
- * need stats to make this do the right thing.
- * this just finds the first lock on one of the
- * queues and uses that node as the target. */
+/*
+ * Pick a node to migrate the lock resource to. This function selects a
+ * potential target based first on the locks and then on refmap. It skips
+ * nodes that are in the process of exiting the domain.
+ */
static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- int i;
+ enum dlm_lockres_list idx;
struct list_head *queue = &res->granted;
struct dlm_lock *lock;
- int nodenum;
+ int noderef;
+ u8 nodenum = O2NM_MAX_NODES;
assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
- spin_lock(&res->spinlock);
- for (i=0; i<3; i++) {
+ /* Go through all the locks */
+ for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+ queue = dlm_list_idx_to_ptr(res, idx);
list_for_each_entry(lock, queue, list) {
- /* up to the caller to make sure this node
- * is alive */
- if (lock->ml.node != dlm->node_num) {
- spin_unlock(&res->spinlock);
- return lock->ml.node;
- }
+ if (lock->ml.node == dlm->node_num)
+ continue;
+ if (test_bit(lock->ml.node, dlm->exit_domain_map))
+ continue;
+ nodenum = lock->ml.node;
+ goto bail;
}
- queue++;
}
- spin_unlock(&res->spinlock);
- mlog(0, "have not found a suitable target yet! checking domain map\n");
- /* ok now we're getting desperate. pick anyone alive. */
- nodenum = -1;
+ /* Go thru the refmap */
+ noderef = -1;
while (1) {
- nodenum = find_next_bit(dlm->domain_map,
- O2NM_MAX_NODES, nodenum+1);
- mlog(0, "found %d in domain map\n", nodenum);
- if (nodenum >= O2NM_MAX_NODES)
+ noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
+ noderef + 1);
+ if (noderef >= O2NM_MAX_NODES)
break;
- if (nodenum != dlm->node_num) {
- mlog(0, "picking %d\n", nodenum);
- return nodenum;
- }
+ if (noderef == dlm->node_num)
+ continue;
+ if (test_bit(noderef, dlm->exit_domain_map))
+ continue;
+ nodenum = noderef;
+ goto bail;
}
- mlog(0, "giving up. no master to migrate to\n");
- return DLM_LOCK_RES_OWNER_UNKNOWN;
+bail:
+ return nodenum;
}
-
-
/* this is called by the new master once all lockres
* data has been received */
static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
@@ -2977,7 +2965,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
&migrate, sizeof(migrate), nodenum,
&status);
if (ret < 0) {
- mlog(0, "migrate_request returned %d!\n", ret);
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send "
+ "MIGRATE_REQUEST to node %u\n", dlm->name,
+ migrate.namelen, migrate.name, ret, nodenum);
if (!dlm_is_host_down(ret)) {
mlog(ML_ERROR, "unhandled error=%d!\n", ret);
BUG();
@@ -2996,7 +2986,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
dlm->name, res->lockname.len, res->lockname.name,
nodenum);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(nodenum, res);
+ dlm_lockres_set_refmap_bit(dlm, res, nodenum);
spin_unlock(&res->spinlock);
}
}
@@ -3035,8 +3025,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
hash = dlm_lockid_hash(name, namelen);
/* preallocate.. if this fails, abort */
- mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
- GFP_NOFS);
+ mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
ret = -ENOMEM;
@@ -3046,8 +3035,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
/* check for pre-existing lock */
spin_lock(&dlm->spinlock);
res = __dlm_lookup_lockres(dlm, name, namelen, hash);
- spin_lock(&dlm->master_lock);
-
if (res) {
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -3065,14 +3052,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&res->spinlock);
}
+ spin_lock(&dlm->master_lock);
/* ignore status. only nonzero status would BUG. */
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
name, namelen,
migrate->new_master,
migrate->master);
-unlock:
spin_unlock(&dlm->master_lock);
+unlock:
spin_unlock(&dlm->spinlock);
if (oldmle) {
@@ -3107,8 +3095,6 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
*oldmle = NULL;
- mlog_entry_void();
-
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&dlm->master_lock);
@@ -3143,11 +3129,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
/* remove it so that only one mle will be found */
__dlm_unlink_mle(dlm, tmp);
__dlm_mle_detach_hb_events(dlm, tmp);
- ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
- mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
- "telling master to get ref for cleared out mle "
- "during migration\n", dlm->name, namelen, name,
- master, new_master);
+ if (tmp->type == DLM_MLE_MASTER) {
+ ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
+ mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
+ "telling master to get ref "
+ "for cleared out mle during "
+ "migration\n", dlm->name,
+ namelen, name, master,
+ new_master);
+ }
}
spin_unlock(&tmp->spinlock);
}
@@ -3245,10 +3235,10 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
struct dlm_master_list_entry *mle;
struct dlm_lock_resource *res;
struct hlist_head *bucket;
- struct hlist_node *list;
+ struct hlist_node *tmp;
unsigned int i;
- mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+ mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
top:
assert_spin_locked(&dlm->spinlock);
@@ -3256,10 +3246,7 @@ top:
spin_lock(&dlm->master_lock);
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_master_hash(dlm, i);
- hlist_for_each(list, bucket) {
- mle = hlist_entry(list, struct dlm_master_list_entry,
- master_hash_node);
-
+ hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
BUG_ON(mle->type != DLM_MLE_BLOCK &&
mle->type != DLM_MLE_MASTER &&
mle->type != DLM_MLE_MIGRATION);
@@ -3334,7 +3321,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
* mastery reference here since old_master will briefly have
* a reference after the migration completes */
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(old_master, res);
+ dlm_lockres_set_refmap_bit(dlm, res, old_master);
spin_unlock(&res->spinlock);
mlog(0, "now time to do a migrate request to other nodes\n");
@@ -3434,3 +3421,41 @@ void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
wake_up(&res->wq);
wake_up(&dlm->migration_wq);
}
+
+void dlm_force_free_mles(struct dlm_ctxt *dlm)
+{
+ int i;
+ struct hlist_head *bucket;
+ struct dlm_master_list_entry *mle;
+ struct hlist_node *tmp;
+
+ /*
+ * We notified all other nodes that we are exiting the domain and
+ * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
+ * around we force free them and wake any processes that are waiting
+ * on the mles
+ */
+ spin_lock(&dlm->spinlock);
+ spin_lock(&dlm->master_lock);
+
+ BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
+ BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
+
+ for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+ bucket = dlm_master_hash(dlm, i);
+ hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
+ if (mle->type != DLM_MLE_BLOCK) {
+ mlog(ML_ERROR, "bad mle: %p\n", mle);
+ dlm_print_one_mle(mle);
+ }
+ atomic_set(&mle->woken, 1);
+ wake_up(&mle->wq);
+
+ __dlm_unlink_mle(dlm, mle);
+ __dlm_mle_detach_hb_events(dlm, mle);
+ __dlm_put_mle(mle);
+ }
+ }
+ spin_unlock(&dlm->master_lock);
+ spin_unlock(&dlm->spinlock);
+}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b4f99de2caf..45067faf569 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -55,9 +55,6 @@
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_recovery_thread(void *data);
-void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
-int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
-void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
static int dlm_do_recovery(struct dlm_ctxt *dlm);
static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
@@ -362,40 +359,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
}
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
{
- if (timeout) {
- mlog(ML_NOTICE, "%s: waiting %dms for notification of "
- "death of node %u\n", dlm->name, timeout, node);
+ if (dlm_is_node_dead(dlm, node))
+ return;
+
+ printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
+ "domain %s\n", node, dlm->name);
+
+ if (timeout)
wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_dead(dlm, node),
- msecs_to_jiffies(timeout));
- } else {
- mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
- "of death of node %u\n", dlm->name, node);
+ dlm_is_node_dead(dlm, node),
+ msecs_to_jiffies(timeout));
+ else
wait_event(dlm->dlm_reco_thread_wq,
dlm_is_node_dead(dlm, node));
- }
- /* for now, return 0 */
- return 0;
}
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
{
- if (timeout) {
- mlog(0, "%s: waiting %dms for notification of "
- "recovery of node %u\n", dlm->name, timeout, node);
+ if (dlm_is_node_recovered(dlm, node))
+ return;
+
+ printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
+ "domain %s\n", node, dlm->name);
+
+ if (timeout)
wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_recovered(dlm, node),
- msecs_to_jiffies(timeout));
- } else {
- mlog(0, "%s: waiting indefinitely for notification "
- "of recovery of node %u\n", dlm->name, node);
+ dlm_is_node_recovered(dlm, node),
+ msecs_to_jiffies(timeout));
+ else
wait_event(dlm->dlm_reco_thread_wq,
dlm_is_node_recovered(dlm, node));
- }
- /* for now, return 0 */
- return 0;
}
/* callers of the top-level api calls (dlmlock/dlmunlock) should
@@ -430,6 +425,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm)
{
spin_lock(&dlm->spinlock);
BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+ printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
+ dlm->name, dlm->reco.dead_node);
dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
spin_unlock(&dlm->spinlock);
}
@@ -440,9 +437,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
spin_unlock(&dlm->spinlock);
+ printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
wake_up(&dlm->reco.event);
}
+static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
+{
+ printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
+ "dead node %u in domain %s\n", dlm->reco.new_master,
+ (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
+ dlm->reco.dead_node, dlm->name);
+}
+
static int dlm_do_recovery(struct dlm_ctxt *dlm)
{
int status = 0;
@@ -463,7 +469,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
int bit;
- bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
+ bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit >= O2NM_MAX_NODES || bit < 0)
dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
else
@@ -505,9 +511,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
}
mlog(0, "another node will master this recovery session.\n");
}
- mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
- dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master,
- dlm->node_num, dlm->reco.dead_node);
+
+ dlm_print_recovery_master(dlm);
/* it is safe to start everything back up here
* because all of the dead node's lock resources
@@ -518,15 +523,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
return 0;
master_here:
- mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node "
- "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task),
- dlm->node_num, dlm->reco.dead_node, dlm->name);
+ dlm_print_recovery_master(dlm);
status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
if (status < 0) {
/* we should never hit this anymore */
- mlog(ML_ERROR, "error %d remastering locks for node %u, "
- "retrying.\n", status, dlm->reco.dead_node);
+ mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
+ "retrying.\n", dlm->name, status, dlm->reco.dead_node);
/* yield a bit to allow any final network messages
* to get handled on remaining nodes */
msleep(100);
@@ -534,7 +537,10 @@ master_here:
/* success! see if any other nodes need recovery */
mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
dlm->name, dlm->reco.dead_node, dlm->node_num);
- dlm_reset_recovery(dlm);
+ spin_lock(&dlm->spinlock);
+ __dlm_reset_recovery(dlm);
+ dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
}
dlm_end_recovery(dlm);
@@ -567,7 +573,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
- mlog(0, "requesting lock info from node %u\n",
+ mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
ndata->node_num);
if (ndata->node_num == dlm->node_num) {
@@ -640,7 +646,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
spin_unlock(&dlm_reco_state_lock);
}
- mlog(0, "done requesting all lock info\n");
+ mlog(0, "%s: Done requesting all lock info\n", dlm->name);
/* nodes should be sending reco data now
* just need to wait */
@@ -692,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
if (all_nodes_done) {
int ret;
+ /* Set this flag on recovery master to avoid
+ * a new recovery for another dead node start
+ * before the recovery is not done. That may
+ * cause recovery hung.*/
+ spin_lock(&dlm->spinlock);
+ dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
+
/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
* just send a finalize message to everyone and
* clean up */
@@ -727,7 +741,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
if (destroy)
dlm_destroy_recovery_area(dlm, dead_node);
- mlog_exit(status);
return status;
}
@@ -784,7 +797,8 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
u8 dead_node)
{
struct dlm_lock_request lr;
- enum dlm_status ret;
+ int ret;
+ int status;
mlog(0, "\n");
@@ -797,14 +811,16 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
lr.dead_node = dead_node;
// send message
- ret = DLM_NOLOCKMGR;
ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
- &lr, sizeof(lr), request_from, NULL);
+ &lr, sizeof(lr), request_from, &status);
/* negative status is handled by caller */
if (ret < 0)
- mlog_errno(ret);
-
+ mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
+ "to recover dead node %u\n", dlm->name, ret,
+ request_from, dead_node);
+ else
+ ret = status;
// return from here, then
// sleep until all received or error
return ret;
@@ -955,10 +971,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
sizeof(done_msg), send_to, &tmpret);
if (ret < 0) {
+ mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
+ "to recover dead node %u\n", dlm->name, ret, send_to,
+ dead_node);
if (!dlm_is_host_down(ret)) {
- mlog_errno(ret);
- mlog(ML_ERROR, "%s: unknown error sending data-done "
- "to %u\n", dlm->name, send_to);
BUG();
}
} else
@@ -1126,7 +1142,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
if (ret < 0) {
/* XXX: negative status is not handled.
* this will end up killing this node. */
- mlog_errno(ret);
+ mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
+ "node %u (%s)\n", dlm->name, mres->lockname_len,
+ mres->lockname, ret, send_to,
+ (orig_flags & DLM_MRES_MIGRATION ?
+ "migration" : "recovery"));
} else {
/* might get an -ENOMEM back here */
ret = status;
@@ -1398,6 +1418,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
mres->lockname_len, mres->lockname);
ret = -EFAULT;
spin_unlock(&res->spinlock);
+ dlm_lockres_put(res);
goto leave;
}
res->state |= DLM_LOCK_RES_MIGRATING;
@@ -1488,13 +1509,11 @@ leave:
dlm_put(dlm);
if (ret < 0) {
- if (buf)
- kfree(buf);
- if (item)
- kfree(item);
+ kfree(buf);
+ kfree(item);
+ mlog_errno(ret);
}
- mlog_exit(ret);
return ret;
}
@@ -1563,7 +1582,6 @@ leave:
dlm_lockres_put(res);
}
kfree(data);
- mlog_exit(ret);
}
@@ -1642,7 +1660,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
&req, sizeof(req), nodenum, &status);
/* XXX: negative status not handled properly here. */
if (ret < 0)
- mlog_errno(ret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
+ dlm->key, nodenum);
else {
BUG_ON(status < 0);
BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -1688,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
mlog_errno(-ENOMEM);
/* retry!? */
BUG();
- }
+ } else
+ __dlm_lockres_grab_inflight_worker(dlm, res);
} else /* put.. incase we are not the master */
dlm_lockres_put(res);
spin_unlock(&res->spinlock);
@@ -1741,13 +1762,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
struct dlm_migratable_lockres *mres)
{
struct dlm_migratable_lock *ml;
- struct list_head *queue;
+ struct list_head *queue, *iter;
struct list_head *tmpq = NULL;
struct dlm_lock *newlock = NULL;
struct dlm_lockstatus *lksb = NULL;
int ret = 0;
int i, j, bad;
- struct dlm_lock *lock = NULL;
+ struct dlm_lock *lock;
u8 from = O2NM_MAX_NODES;
unsigned int added = 0;
__be64 c;
@@ -1763,7 +1784,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
dlm->name, mres->lockname_len, mres->lockname,
from);
spin_lock(&res->spinlock);
- dlm_lockres_set_refmap_bit(from, res);
+ dlm_lockres_set_refmap_bit(dlm, res, from);
spin_unlock(&res->spinlock);
added++;
break;
@@ -1782,14 +1803,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
/* MIGRATION ONLY! */
BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
+ lock = NULL;
spin_lock(&res->spinlock);
for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
tmpq = dlm_list_idx_to_ptr(res, j);
- list_for_each_entry(lock, tmpq, list) {
- if (lock->ml.cookie != ml->cookie)
- lock = NULL;
- else
+ list_for_each(iter, tmpq) {
+ lock = list_entry(iter,
+ struct dlm_lock, list);
+ if (lock->ml.cookie == ml->cookie)
break;
+ lock = NULL;
}
if (lock)
break;
@@ -1877,6 +1900,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
if (ml->type == LKM_NLMODE)
goto skip_lvb;
+ /*
+ * If the lock is in the blocked list it can't have a valid lvb,
+ * so skip it
+ */
+ if (ml->list == DLM_BLOCKED_LIST)
+ goto skip_lvb;
+
if (!dlm_lvb_is_empty(mres->lvb)) {
if (lksb->flags & DLM_LKSB_PUT_LVB) {
/* other node was trying to update
@@ -1957,11 +1987,19 @@ skip_lvb:
}
if (!bad) {
dlm_lock_get(newlock);
- list_add_tail(&newlock->list, queue);
+ if (mres->flags & DLM_MRES_RECOVERY &&
+ ml->list == DLM_CONVERTING_LIST &&
+ newlock->ml.type >
+ newlock->ml.convert_type) {
+ /* newlock is doing downconvert, add it to the
+ * head of converting list */
+ list_add(&newlock->list, queue);
+ } else
+ list_add_tail(&newlock->list, queue);
mlog(0, "%s:%.*s: added lock for node %u, "
"setting refmap bit\n", dlm->name,
res->lockname.len, res->lockname.name, ml->node);
- dlm_lockres_set_refmap_bit(ml->node, res);
+ dlm_lockres_set_refmap_bit(dlm, res, ml->node);
added++;
}
spin_unlock(&res->spinlock);
@@ -1980,7 +2018,6 @@ leave:
dlm_lock_put(newlock);
}
- mlog_exit(ret);
return ret;
}
@@ -1991,6 +2028,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
struct list_head *queue;
struct dlm_lock *lock, *next;
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
res->state |= DLM_LOCK_RES_RECOVERING;
if (!list_empty(&res->recovering)) {
mlog(0,
@@ -2071,16 +2110,16 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
u8 dead_node, u8 new_master)
{
int i;
- struct hlist_node *hash_iter;
struct hlist_head *bucket;
struct dlm_lock_resource *res, *next;
- mlog_entry_void();
-
assert_spin_locked(&dlm->spinlock);
list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
if (res->owner == dead_node) {
+ mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->owner, new_master);
list_del_init(&res->recovering);
spin_lock(&res->spinlock);
/* new_master has our reference from
@@ -2101,41 +2140,31 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
* if necessary */
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
- if (res->state & DLM_LOCK_RES_RECOVERING) {
- if (res->owner == dead_node) {
- mlog(0, "(this=%u) res %.*s owner=%u "
- "was not on recovering list, but "
- "clearing state anyway\n",
- dlm->node_num, res->lockname.len,
- res->lockname.name, new_master);
- } else if (res->owner == dlm->node_num) {
- mlog(0, "(this=%u) res %.*s owner=%u "
- "was not on recovering list, "
- "owner is THIS node, clearing\n",
- dlm->node_num, res->lockname.len,
- res->lockname.name, new_master);
- } else
- continue;
+ hlist_for_each_entry(res, bucket, hash_node) {
+ if (!(res->state & DLM_LOCK_RES_RECOVERING))
+ continue;
- if (!list_empty(&res->recovering)) {
- mlog(0, "%s:%.*s: lockres was "
- "marked RECOVERING, owner=%u\n",
- dlm->name, res->lockname.len,
- res->lockname.name, res->owner);
- list_del_init(&res->recovering);
- dlm_lockres_put(res);
- }
- spin_lock(&res->spinlock);
- /* new_master has our reference from
- * the lock state sent during recovery */
- dlm_change_lockres_owner(dlm, res, new_master);
- res->state &= ~DLM_LOCK_RES_RECOVERING;
- if (__dlm_lockres_has_locks(res))
- __dlm_dirty_lockres(dlm, res);
- spin_unlock(&res->spinlock);
- wake_up(&res->wq);
+ if (res->owner != dead_node &&
+ res->owner != dlm->node_num)
+ continue;
+
+ if (!list_empty(&res->recovering)) {
+ list_del_init(&res->recovering);
+ dlm_lockres_put(res);
}
+
+ /* new_master has our reference from
+ * the lock state sent during recovery */
+ mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->owner, new_master);
+ spin_lock(&res->spinlock);
+ dlm_change_lockres_owner(dlm, res, new_master);
+ res->state &= ~DLM_LOCK_RES_RECOVERING;
+ if (__dlm_lockres_has_locks(res))
+ __dlm_dirty_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
}
}
}
@@ -2249,12 +2278,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
res->lockname.len, res->lockname.name, freed, dead_node);
__dlm_print_one_lock_resource(res);
}
- dlm_lockres_clear_refmap_bit(dead_node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
} else if (test_bit(dead_node, res->refmap)) {
mlog(0, "%s:%.*s: dead node %u had a ref, but had "
"no locks and had not purged before dying\n", dlm->name,
res->lockname.len, res->lockname.name, dead_node);
- dlm_lockres_clear_refmap_bit(dead_node, res);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
}
/* do not kick thread yet */
@@ -2270,7 +2299,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
{
- struct hlist_node *iter;
struct dlm_lock_resource *res;
int i;
struct hlist_head *bucket;
@@ -2296,7 +2324,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
*/
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, iter, bucket, hash_node) {
+ hlist_for_each_entry(res, bucket, hash_node) {
/* always prune any $RECOVERY entries for dead nodes,
* otherwise hangs can occur during later recovery */
if (dlm_is_recovery_lock(res->lockname.name,
@@ -2320,22 +2348,26 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
/* zero the lvb if necessary */
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
- if (res->state & DLM_LOCK_RES_DROPPING_REF)
- mlog(0, "%s:%.*s: owned by "
- "dead node %u, this node was "
- "dropping its ref when it died. "
- "continue, dropping the flag.\n",
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(ML_NOTICE, "%s: res %.*s, Skip "
+ "recovery as it is being freed\n",
dlm->name, res->lockname.len,
- res->lockname.name, dead_node);
-
- /* the wake_up for this will happen when the
- * RECOVERING flag is dropped later */
- res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ res->lockname.name);
+ } else
+ dlm_move_lockres_to_recovery_list(dlm,
+ res);
- dlm_move_lockres_to_recovery_list(dlm, res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
+ } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+ if (test_bit(dead_node, res->refmap)) {
+ mlog(0, "%s:%.*s: dead node %u had a ref, but had "
+ "no locks and had not purged before dying\n",
+ dlm->name, res->lockname.len,
+ res->lockname.name, dead_node);
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
+ }
}
spin_unlock(&res->spinlock);
}
@@ -2394,6 +2426,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
mlog(0, "node %u being removed from domain map!\n", idx);
clear_bit(idx, dlm->domain_map);
+ clear_bit(idx, dlm->exit_domain_map);
/* wake up migration waiters if a node goes down.
* perhaps later we can genericize this for other waiters. */
wake_up(&dlm->migration_wq);
@@ -2603,8 +2636,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
int nodenum;
int status;
- mlog_entry("%u\n", dead_node);
-
mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
spin_lock(&dlm->spinlock);
@@ -2640,7 +2671,7 @@ retry:
if (dlm_is_host_down(ret)) {
/* node is down. not involved in recovery
* so just keep going */
- mlog(0, "%s: node %u was down when sending "
+ mlog(ML_NOTICE, "%s: node %u was down when sending "
"begin reco msg (%d)\n", dlm->name, nodenum, ret);
ret = 0;
}
@@ -2660,11 +2691,12 @@ retry:
}
if (ret < 0) {
struct dlm_lock_resource *res;
+
/* this is now a serious problem, possibly ENOMEM
* in the network stack. must retry */
mlog_errno(ret);
mlog(ML_ERROR, "begin reco of dlm %s to node %u "
- " returned %d\n", dlm->name, nodenum, ret);
+ "returned %d\n", dlm->name, nodenum, ret);
res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
DLM_RECOVERY_LOCK_NAME_LEN);
if (res) {
@@ -2700,6 +2732,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
dlm->name, br->node_idx, br->dead_node,
dlm->reco.dead_node, dlm->reco.new_master);
spin_unlock(&dlm->spinlock);
+ dlm_put(dlm);
return -EAGAIN;
}
spin_unlock(&dlm->spinlock);
@@ -2789,7 +2822,9 @@ stage2:
if (ret >= 0)
ret = status;
if (ret < 0) {
- mlog_errno(ret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key "
+ "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
+ dlm->key, nodenum);
if (dlm_is_host_down(ret)) {
/* this has no effect on this recovery
* session, so set the status to zero to
@@ -2869,8 +2904,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
BUG();
}
dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ __dlm_reset_recovery(dlm);
spin_unlock(&dlm->spinlock);
- dlm_reset_recovery(dlm);
dlm_kick_recovery_thread(dlm);
break;
default:
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78..69aac6f088a 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,7 +28,6 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
@@ -93,19 +92,29 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
* truly ready to be freed. */
int __dlm_lockres_unused(struct dlm_lock_resource *res)
{
- if (!__dlm_lockres_has_locks(res) &&
- (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) {
- /* try not to scan the bitmap unless the first two
- * conditions are already true */
- int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
- if (bit >= O2NM_MAX_NODES) {
- /* since the bit for dlm->node_num is not
- * set, inflight_locks better be zero */
- BUG_ON(res->inflight_locks != 0);
- return 1;
- }
- }
- return 0;
+ int bit;
+
+ assert_spin_locked(&res->spinlock);
+
+ if (__dlm_lockres_has_locks(res))
+ return 0;
+
+ /* Locks are in the process of being created */
+ if (res->inflight_locks)
+ return 0;
+
+ if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
+ return 0;
+
+ if (res->state & DLM_LOCK_RES_RECOVERING)
+ return 0;
+
+ /* Another node has this resource with this node as the master */
+ bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+ if (bit < O2NM_MAX_NODES)
+ return 0;
+
+ return 1;
}
@@ -115,15 +124,13 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
-
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
if (__dlm_lockres_unused(res)){
if (list_empty(&res->purge)) {
- mlog(0, "putting lockres %.*s:%p onto purge list\n",
- res->lockname.len, res->lockname.name, res);
+ mlog(0, "%s: Adding res %.*s to purge list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
res->last_used = jiffies;
dlm_lockres_get(res);
@@ -131,8 +138,8 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
dlm->purge_count++;
}
} else if (!list_empty(&res->purge)) {
- mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n",
- res->lockname.len, res->lockname.name, res, res->owner);
+ mlog(0, "%s: Removing res %.*s from purge list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
list_del_init(&res->purge);
dlm_lockres_put(res);
@@ -143,7 +150,6 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
@@ -153,45 +159,24 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
spin_unlock(&dlm->spinlock);
}
-static int dlm_purge_lockres(struct dlm_ctxt *dlm,
+static void dlm_purge_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
int master;
int ret = 0;
- spin_lock(&res->spinlock);
- if (!__dlm_lockres_unused(res)) {
- mlog(0, "%s:%.*s: tried to purge but not unused\n",
- dlm->name, res->lockname.len, res->lockname.name);
- __dlm_print_one_lock_resource(res);
- spin_unlock(&res->spinlock);
- BUG();
- }
-
- if (res->state & DLM_LOCK_RES_MIGRATING) {
- mlog(0, "%s:%.*s: Delay dropref as this lockres is "
- "being remastered\n", dlm->name, res->lockname.len,
- res->lockname.name);
- /* Re-add the lockres to the end of the purge list */
- if (!list_empty(&res->purge)) {
- list_del_init(&res->purge);
- list_add_tail(&res->purge, &dlm->purge_list);
- }
- spin_unlock(&res->spinlock);
- return 0;
- }
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
master = (res->owner == dlm->node_num);
- if (!master)
- res->state |= DLM_LOCK_RES_DROPPING_REF;
- spin_unlock(&res->spinlock);
-
- mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
- res->lockname.name, master);
+ mlog(0, "%s: Purging res %.*s, master %d\n", dlm->name,
+ res->lockname.len, res->lockname.name, master);
if (!master) {
+ res->state |= DLM_LOCK_RES_DROPPING_REF;
/* drop spinlock... retake below */
+ spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
spin_lock(&res->spinlock);
@@ -202,38 +187,38 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
/* clear our bit from the master's refmap, ignore errors */
ret = dlm_drop_lockres_ref(dlm, res);
if (ret < 0) {
- mlog_errno(ret);
if (!dlm_is_host_down(ret))
BUG();
}
- mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
- dlm->name, res->lockname.len, res->lockname.name, ret);
spin_lock(&dlm->spinlock);
+ spin_lock(&res->spinlock);
}
- spin_lock(&res->spinlock);
if (!list_empty(&res->purge)) {
- mlog(0, "removing lockres %.*s:%p from purgelist, "
- "master = %d\n", res->lockname.len, res->lockname.name,
- res, master);
+ mlog(0, "%s: Removing res %.*s from purgelist, master %d\n",
+ dlm->name, res->lockname.len, res->lockname.name, master);
list_del_init(&res->purge);
- spin_unlock(&res->spinlock);
dlm_lockres_put(res);
dlm->purge_count--;
- } else
- spin_unlock(&res->spinlock);
+ }
- __dlm_unhash_lockres(res);
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
+
+ __dlm_unhash_lockres(dlm, res);
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
- spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_DROPPING_REF;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
- }
- return 0;
+ } else
+ spin_unlock(&res->spinlock);
}
static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -252,17 +237,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
lockres = list_entry(dlm->purge_list.next,
struct dlm_lock_resource, purge);
- /* Status of the lockres *might* change so double
- * check. If the lockres is unused, holding the dlm
- * spinlock will prevent people from getting and more
- * refs on it -- there's no need to keep the lockres
- * spinlock. */
spin_lock(&lockres->spinlock);
- unused = __dlm_lockres_unused(lockres);
- spin_unlock(&lockres->spinlock);
-
- if (!unused)
- continue;
purge_jiffies = lockres->last_used +
msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -274,15 +249,32 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
* in tail order, we can stop at the first
* unpurgable resource -- anyone added after
* him will have a greater last_used value */
+ spin_unlock(&lockres->spinlock);
break;
}
+ /* Status of the lockres *might* change so double
+ * check. If the lockres is unused, holding the dlm
+ * spinlock will prevent people from getting and more
+ * refs on it. */
+ unused = __dlm_lockres_unused(lockres);
+ if (!unused ||
+ (lockres->state & DLM_LOCK_RES_MIGRATING) ||
+ (lockres->inflight_assert_workers != 0)) {
+ mlog(0, "%s: res %.*s is in use or being remastered, "
+ "used %d, state %d, assert master workers %u\n",
+ dlm->name, lockres->lockname.len,
+ lockres->lockname.name,
+ !unused, lockres->state,
+ lockres->inflight_assert_workers);
+ list_move_tail(&lockres->purge, &dlm->purge_list);
+ spin_unlock(&lockres->spinlock);
+ continue;
+ }
+
dlm_lockres_get(lockres);
- /* This may drop and reacquire the dlm spinlock if it
- * has to do migration. */
- if (dlm_purge_lockres(dlm, lockres))
- BUG();
+ dlm_purge_lockres(dlm, lockres);
dlm_lockres_put(lockres);
@@ -297,19 +289,15 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
struct dlm_lock *lock, *target;
- struct list_head *iter;
- struct list_head *head;
int can_grant = 1;
- //mlog(0, "res->lockname.len=%d\n", res->lockname.len);
- //mlog(0, "res->lockname.name=%p\n", res->lockname.name);
- //mlog(0, "shuffle res %.*s\n", res->lockname.len,
- // res->lockname.name);
-
- /* because this function is called with the lockres
+ /*
+ * Because this function is called with the lockres
* spinlock, and because we know that it is not migrating/
* recovering/in-progress, it is fine to reserve asts and
- * basts right before queueing them all throughout */
+ * basts right before queueing them all throughout
+ */
+ assert_spin_locked(&dlm->ast_lock);
assert_spin_locked(&res->spinlock);
BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
DLM_LOCK_RES_RECOVERING|
@@ -318,18 +306,16 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
converting:
if (list_empty(&res->converting))
goto blocked;
- mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
- res->lockname.name);
+ mlog(0, "%s: res %.*s has locks on the convert queue\n", dlm->name,
+ res->lockname.len, res->lockname.name);
target = list_entry(res->converting.next, struct dlm_lock, list);
if (target->ml.convert_type == LKM_IVMODE) {
- mlog(ML_ERROR, "%.*s: converting a lock with no "
- "convert_type!\n", res->lockname.len, res->lockname.name);
+ mlog(ML_ERROR, "%s: res %.*s converting lock to invalid mode\n",
+ dlm->name, res->lockname.len, res->lockname.name);
BUG();
}
- head = &res->granted;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->granted, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type,
@@ -338,7 +324,7 @@ converting:
/* queue the BAST if not already */
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
/* update the highest_blocked if needed */
if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -346,9 +332,8 @@ converting:
target->ml.convert_type;
}
}
- head = &res->converting;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+
+ list_for_each_entry(lock, &res->converting, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type,
@@ -356,7 +341,7 @@ converting:
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.convert_type)
lock->ml.highest_blocked =
@@ -369,9 +354,12 @@ converting:
spin_lock(&target->spinlock);
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
- mlog(0, "calling ast for converting lock: %.*s, have: %d, "
- "granting: %d, node: %u\n", res->lockname.len,
- res->lockname.name, target->ml.type,
+ mlog(0, "%s: res %.*s, AST for Converting lock %u:%llu, type "
+ "%d => %d, node %u\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
+ target->ml.type,
target->ml.convert_type, target->ml.node);
target->ml.type = target->ml.convert_type;
@@ -384,7 +372,7 @@ converting:
spin_unlock(&target->spinlock);
__dlm_lockres_reserve_ast(res);
- dlm_queue_ast(dlm, target);
+ __dlm_queue_ast(dlm, target);
/* go back and check for more */
goto converting;
}
@@ -394,32 +382,28 @@ blocked:
goto leave;
target = list_entry(res->blocked.next, struct dlm_lock, list);
- head = &res->granted;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->granted, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.type)
lock->ml.highest_blocked = target->ml.type;
}
}
- head = &res->converting;
- list_for_each(iter, head) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, &res->converting, list) {
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
- dlm_queue_bast(dlm, lock);
+ __dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.type)
lock->ml.highest_blocked = target->ml.type;
@@ -432,11 +416,14 @@ blocked:
spin_lock(&target->spinlock);
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
- mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
- "node: %u\n", res->lockname.len, res->lockname.name,
+ mlog(0, "%s: res %.*s, AST for Blocked lock %u:%llu, type %d, "
+ "node %u\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(target->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(target->ml.cookie)),
target->ml.type, target->ml.node);
- // target->ml.type is already correct
+ /* target->ml.type is already correct */
list_move_tail(&target->list, &res->granted);
BUG_ON(!target->lksb);
@@ -445,7 +432,7 @@ blocked:
spin_unlock(&target->spinlock);
__dlm_lockres_reserve_ast(res);
- dlm_queue_ast(dlm, target);
+ __dlm_queue_ast(dlm, target);
/* go back and check for more */
goto converting;
}
@@ -457,7 +444,6 @@ leave:
/* must have NO locks when calling this with res !=NULL * */
void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- mlog_entry("dlm=%p, res=%p\n", dlm, res);
if (res) {
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
@@ -470,8 +456,6 @@ void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
- mlog_entry("dlm=%p, res=%p\n", dlm, res);
-
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
@@ -488,13 +472,16 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
res->state |= DLM_LOCK_RES_DIRTY;
}
}
+
+ mlog(0, "%s: res %.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
}
/* Launch the NM thread for the mounted volume */
int dlm_launch_thread(struct dlm_ctxt *dlm)
{
- mlog(0, "starting dlm thread...\n");
+ mlog(0, "Starting dlm_thread...\n");
dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
if (IS_ERR(dlm->dlm_thread_task)) {
@@ -509,7 +496,7 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
void dlm_complete_thread(struct dlm_ctxt *dlm)
{
if (dlm->dlm_thread_task) {
- mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
+ mlog(ML_KTHREAD, "Waiting for dlm thread to exit\n");
kthread_stop(dlm->dlm_thread_task);
dlm->dlm_thread_task = NULL;
}
@@ -540,7 +527,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
/* get an extra ref on lock */
dlm_lock_get(lock);
res = lock->lockres;
- mlog(0, "delivering an ast for this lockres\n");
+ mlog(0, "%s: res %.*s, Flush AST for lock %u:%llu, type %d, "
+ "node %u\n", dlm->name, res->lockname.len,
+ res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->ml.type, lock->ml.node);
BUG_ON(!lock->ast_pending);
@@ -561,9 +553,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
/* possible that another ast was queued while
* we were delivering the last one */
if (!list_empty(&lock->ast_list)) {
- mlog(0, "aha another ast got queued while "
- "we were finishing the last one. will "
- "keep the ast_pending flag set.\n");
+ mlog(0, "%s: res %.*s, AST queued while flushing last "
+ "one\n", dlm->name, res->lockname.len,
+ res->lockname.name);
} else
lock->ast_pending = 0;
@@ -594,8 +586,12 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
dlm_lock_put(lock);
spin_unlock(&dlm->ast_lock);
- mlog(0, "delivering a bast for this lockres "
- "(blocked = %d\n", hi);
+ mlog(0, "%s: res %.*s, Flush BAST for lock %u:%llu, "
+ "blocked %d, node %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ hi, lock->ml.node);
if (lock->ml.node != dlm->node_num) {
ret = dlm_send_proxy_bast(dlm, res, lock, hi);
@@ -609,9 +605,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
/* possible that another bast was queued while
* we were delivering the last one */
if (!list_empty(&lock->bast_list)) {
- mlog(0, "aha another bast got queued while "
- "we were finishing the last one. will "
- "keep the bast_pending flag set.\n");
+ mlog(0, "%s: res %.*s, BAST queued while flushing last "
+ "one\n", dlm->name, res->lockname.len,
+ res->lockname.name);
} else
lock->bast_pending = 0;
@@ -675,14 +671,16 @@ static int dlm_thread(void *data)
/* lockres can be re-dirtied/re-added to the
* dirty_list in this gap, but that is ok */
+ spin_lock(&dlm->ast_lock);
spin_lock(&res->spinlock);
if (res->owner != dlm->node_num) {
__dlm_print_one_lock_resource(res);
- mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
- res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
- res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
- res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
- res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
+ mlog(ML_ERROR, "%s: inprog %d, mig %d, reco %d,"
+ " dirty %d\n", dlm->name,
+ !!(res->state & DLM_LOCK_RES_IN_PROGRESS),
+ !!(res->state & DLM_LOCK_RES_MIGRATING),
+ !!(res->state & DLM_LOCK_RES_RECOVERING),
+ !!(res->state & DLM_LOCK_RES_DIRTY));
}
BUG_ON(res->owner != dlm->node_num);
@@ -695,8 +693,9 @@ static int dlm_thread(void *data)
/* move it to the tail and keep going */
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
- mlog(0, "delaying list shuffling for in-"
- "progress lockres %.*s, state=%d\n",
+ spin_unlock(&dlm->ast_lock);
+ mlog(0, "%s: res %.*s, inprogress, delay list "
+ "shuffle, state %d\n", dlm->name,
res->lockname.len, res->lockname.name,
res->state);
delay = 1;
@@ -708,14 +707,11 @@ static int dlm_thread(void *data)
* spinlock and do NOT have the dlm lock.
* safe to reserve/queue asts and run the lists. */
- mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
- "res=%.*s\n", dlm->name,
- res->lockname.len, res->lockname.name);
-
/* called while holding lockres lock */
dlm_shuffle_lists(dlm, res);
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->ast_lock);
dlm_lockres_calc_usage(dlm, res);
@@ -734,7 +730,8 @@ in_progress:
/* unlikely, but we may need to give time to
* other tasks */
if (!--n) {
- mlog(0, "throttling dlm_thread\n");
+ mlog(0, "%s: Throttling dlm thread\n",
+ dlm->name);
break;
}
}
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 49e29ecd020..2e3c9dbab68 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,7 +28,6 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
@@ -192,7 +191,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
DLM_UNLOCK_CLEAR_CONVERT_TYPE);
} else if (status == DLM_RECOVERING ||
status == DLM_MIGRATING ||
- status == DLM_FORWARD) {
+ status == DLM_FORWARD ||
+ status == DLM_NOLOCKMGR
+ ) {
/* must clear the actions because this unlock
* is about to be retried. cannot free or do
* any list manipulation. */
@@ -201,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
res->lockname.name,
status==DLM_RECOVERING?"recovering":
(status==DLM_MIGRATING?"migrating":
- "forward"));
+ (status == DLM_FORWARD ? "forward" :
+ "nolockmanager")));
actions = 0;
}
if (flags & LKM_CANCEL)
@@ -318,7 +320,7 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
struct kvec vec[2];
size_t veclen = 1;
- mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+ mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
if (owner == dlm->node_num) {
/* ended up trying to contact ourself. this means
@@ -355,7 +357,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
mlog(0, "master was in-progress. retry\n");
ret = status;
} else {
- mlog_errno(tmpret);
+ mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
+ "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
if (dlm_is_host_down(tmpret)) {
/* NOTE: this seems strange, but it is what we want.
* when the master goes down during a cancel or
@@ -364,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
* updated state to the recovery master. this thread
* just needs to finish out the operation and call
* the unlockast. */
- ret = DLM_NORMAL;
+ if (dlm_is_node_dead(dlm, owner))
+ ret = DLM_NORMAL;
+ else
+ ret = DLM_NOLOCKMGR;
} else {
/* something bad. this will BUG in ocfs2 */
ret = dlm_err_to_dlm_status(tmpret);
@@ -388,7 +394,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
struct dlm_ctxt *dlm = data;
struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
- struct list_head *iter;
struct dlm_lock *lock = NULL;
enum dlm_status status = DLM_NORMAL;
int found = 0, i;
@@ -458,8 +463,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
}
for (i=0; i<3; i++) {
- list_for_each(iter, queue) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, queue, list) {
if (lock->ml.cookie == unlock->cookie &&
lock->ml.node == unlock->node_idx) {
dlm_lock_get(lock);
@@ -588,8 +592,6 @@ enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
struct dlm_lock *lock = NULL;
int call_ast, is_master;
- mlog_entry_void();
-
if (!lksb) {
dlm_error(DLM_BADARGS);
return DLM_BADARGS;
@@ -642,7 +644,9 @@ retry:
if (status == DLM_RECOVERING ||
status == DLM_MIGRATING ||
- status == DLM_FORWARD) {
+ status == DLM_FORWARD ||
+ status == DLM_NOLOCKMGR) {
+
/* We want to go away for a tiny bit to allow recovery
* / migration to complete on this resource. I don't
* know of any wait queue we could sleep on as this
@@ -654,7 +658,7 @@ retry:
msleep(50);
mlog(0, "retrying unlock due to pending recovery/"
- "migration/in-progress\n");
+ "migration/in-progress/reconnect\n");
goto retry;
}
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
deleted file mode 100644
index dfc0da4d158..00000000000
--- a/fs/ocfs2/dlm/dlmver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
-
-void dlm_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
deleted file mode 100644
index f674aee77a1..00000000000
--- a/fs/ocfs2/dlm/dlmver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLM_VER_H
-#define DLM_VER_H
-
-void dlm_print_version(void);
-
-#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index df69b4856d0..eed3db8c5b4 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -1,5 +1,5 @@
-EXTRA_CFLAGS += -Ifs/ocfs2
+ccflags-y := -Ifs/ocfs2
obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
-ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 1b0de157a08..09b7d9dac71 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -49,7 +49,6 @@
#include "stackglue.h"
#include "userdlm.h"
-#include "dlmfsver.h"
#define MLOG_MASK_PREFIX ML_DLMFS
#include "cluster/masklog.h"
@@ -88,7 +87,7 @@ struct workqueue_struct *user_dlm_worker;
* signifies a bast fired on the lock.
*/
#define DLMFS_CAPABILITIES "bast stackglue"
-extern int param_set_dlmfs_capabilities(const char *val,
+static int param_set_dlmfs_capabilities(const char *val,
struct kernel_param *kp)
{
printk(KERN_ERR "%s: readonly parameter\n", kp->name);
@@ -112,20 +111,20 @@ MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
* O_RDONLY -> PRMODE level
* O_WRONLY -> EXMODE level
*
- * O_NONBLOCK -> LKM_NOQUEUE
+ * O_NONBLOCK -> NOQUEUE
*/
static int dlmfs_decode_open_flags(int open_flags,
int *level,
int *flags)
{
if (open_flags & (O_WRONLY|O_RDWR))
- *level = LKM_EXMODE;
+ *level = DLM_LOCK_EX;
else
- *level = LKM_PRMODE;
+ *level = DLM_LOCK_PR;
*flags = 0;
if (open_flags & O_NONBLOCK)
- *flags |= LKM_NOQUEUE;
+ *flags |= DLM_LKF_NOQUEUE;
return 0;
}
@@ -166,7 +165,7 @@ static int dlmfs_file_open(struct inode *inode,
* to be able userspace to be able to distinguish a
* valid lock request from one that simply couldn't be
* granted. */
- if (flags & LKM_NOQUEUE && status == -EAGAIN)
+ if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
status = -ETXTBSY;
kfree(fp);
goto bail;
@@ -182,8 +181,7 @@ static int dlmfs_file_release(struct inode *inode,
{
int level, status;
struct dlmfs_inode_private *ip = DLMFS_I(inode);
- struct dlmfs_filp_private *fp =
- (struct dlmfs_filp_private *) file->private_data;
+ struct dlmfs_filp_private *fp = file->private_data;
if (S_ISDIR(inode->i_mode))
BUG();
@@ -193,7 +191,7 @@ static int dlmfs_file_release(struct inode *inode,
status = 0;
if (fp) {
level = fp->fp_lock_level;
- if (level != LKM_IVMODE)
+ if (level != DLM_LOCK_IV)
user_dlm_cluster_unlock(&ip->ip_lockres, level);
kfree(fp);
@@ -214,16 +212,18 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid &= ~ATTR_SIZE;
error = inode_change_ok(inode, attr);
- if (!error)
- error = inode_setattr(inode, attr);
+ if (error)
+ return error;
- return error;
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
+ return 0;
}
static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
{
int event = 0;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct dlmfs_inode_private *ip = DLMFS_I(inode);
poll_wait(file, &ip->ip_lockres.l_event, wait);
@@ -244,7 +244,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
int bytes_left;
ssize_t readlen, got;
char *lvb_buf;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
inode->i_ino, count, *ppos);
@@ -262,7 +262,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
if ((count + *ppos) > i_size_read(inode))
readlen = i_size_read(inode) - *ppos;
else
- readlen = count - *ppos;
+ readlen = count;
lvb_buf = kmalloc(readlen, GFP_NOFS);
if (!lvb_buf)
@@ -292,7 +292,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
int bytes_left;
ssize_t writelen;
char *lvb_buf;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
inode->i_ino, count, *ppos);
@@ -350,18 +350,23 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
return &ip->ip_vfs_inode;
}
-static void dlmfs_destroy_inode(struct inode *inode)
+static void dlmfs_i_callback(struct rcu_head *head)
{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
}
-static void dlmfs_clear_inode(struct inode *inode)
+static void dlmfs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, dlmfs_i_callback);
+}
+
+static void dlmfs_evict_inode(struct inode *inode)
{
int status;
struct dlmfs_inode_private *ip;
- if (!inode)
- return;
+ clear_inode(inode);
mlog(0, "inode %lu\n", inode->i_ino);
@@ -394,15 +399,11 @@ static struct backing_dev_info dlmfs_backing_dev_info = {
static struct inode *dlmfs_get_root_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
- int mode = S_IFDIR | 0755;
- struct dlmfs_inode_private *ip;
+ umode_t mode = S_IFDIR | 0755;
if (inode) {
- ip = DLMFS_I(inode);
-
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
+ inode->i_ino = get_next_ino();
+ inode_init_owner(inode, NULL, mode);
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inc_nlink(inode);
@@ -416,7 +417,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
static struct inode *dlmfs_get_inode(struct inode *parent,
struct dentry *dentry,
- int mode)
+ umode_t mode)
{
struct super_block *sb = parent->i_sb;
struct inode * inode = new_inode(sb);
@@ -425,9 +426,8 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
if (!inode)
return NULL;
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
+ inode->i_ino = get_next_ino();
+ inode_init_owner(inode, parent, mode);
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -464,13 +464,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
inc_nlink(inode);
break;
}
-
- if (parent->i_mode & S_ISGID) {
- inode->i_gid = parent->i_gid;
- if (S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
- }
-
return inode;
}
@@ -480,7 +473,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
/* SMP-safe */
static int dlmfs_mkdir(struct inode * dir,
struct dentry * dentry,
- int mode)
+ umode_t mode)
{
int status;
struct inode *inode = NULL;
@@ -528,8 +521,8 @@ bail:
static int dlmfs_create(struct inode *dir,
struct dentry *dentry,
- int mode,
- struct nameidata *nd)
+ umode_t mode,
+ bool excl)
{
int status = 0;
struct inode *inode;
@@ -585,24 +578,14 @@ static int dlmfs_fill_super(struct super_block * sb,
void * data,
int silent)
{
- struct inode * inode;
- struct dentry * root;
-
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = DLMFS_MAGIC;
sb->s_op = &dlmfs_ops;
- inode = dlmfs_get_root_inode(sb);
- if (!inode)
+ sb->s_root = d_make_root(dlmfs_get_root_inode(sb));
+ if (!sb->s_root)
return -ENOMEM;
-
- root = d_alloc_root(inode);
- if (!root) {
- iput(inode);
- return -ENOMEM;
- }
- sb->s_root = root;
return 0;
}
@@ -612,6 +595,7 @@ static const struct file_operations dlmfs_file_operations = {
.poll = dlmfs_file_poll,
.read = dlmfs_file_read,
.write = dlmfs_file_write,
+ .llseek = default_llseek,
};
static const struct inode_operations dlmfs_dir_inode_operations = {
@@ -631,7 +615,7 @@ static const struct super_operations dlmfs_ops = {
.statfs = simple_statfs,
.alloc_inode = dlmfs_alloc_inode,
.destroy_inode = dlmfs_destroy_inode,
- .clear_inode = dlmfs_clear_inode,
+ .evict_inode = dlmfs_evict_inode,
.drop_inode = generic_delete_inode,
};
@@ -640,26 +624,25 @@ static const struct inode_operations dlmfs_file_inode_operations = {
.setattr = dlmfs_file_setattr,
};
-static int dlmfs_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
{
- return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
+ return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
}
static struct file_system_type dlmfs_fs_type = {
.owner = THIS_MODULE,
.name = "ocfs2_dlmfs",
- .get_sb = dlmfs_get_sb,
+ .mount = dlmfs_mount,
.kill_sb = kill_litter_super,
};
+MODULE_ALIAS_FS("ocfs2_dlmfs");
static int __init init_dlmfs_fs(void)
{
int status;
int cleanup_inode = 0, cleanup_worker = 0;
- dlmfs_print_version();
-
status = bdi_init(&dlmfs_backing_dev_info);
if (status)
return status;
@@ -703,6 +686,11 @@ static void __exit exit_dlmfs_fs(void)
flush_workqueue(user_dlm_worker);
destroy_workqueue(user_dlm_worker);
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
kmem_cache_destroy(dlmfs_inode_cache);
bdi_destroy(&dlmfs_backing_dev_info);
@@ -710,6 +698,7 @@ static void __exit exit_dlmfs_fs(void)
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
module_init(init_dlmfs_fs)
module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
deleted file mode 100644
index a733b3321f8..00000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmfsver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
-
-void dlmfs_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
deleted file mode 100644
index f35eadbed25..00000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLMFS_VER_H
-#define DLMFS_VER_H
-
-void dlmfs_print_version(void);
-
-#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 50c4ee805da..52cfe99ae05 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -64,7 +64,7 @@ struct ocfs2_mask_waiter {
unsigned long mw_mask;
unsigned long mw_goal;
#ifdef CONFIG_OCFS2_FS_STATS
- unsigned long long mw_lock_start;
+ ktime_t mw_lock_start;
#endif
};
@@ -397,8 +397,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
{
int len;
- mlog_entry_void();
-
BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
@@ -408,8 +406,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
mlog(0, "built lock resource with name: %s\n", name);
-
- mlog_exit_void();
}
static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
@@ -435,44 +431,41 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
#ifdef CONFIG_OCFS2_FS_STATS
static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
{
- res->l_lock_num_prmode = 0;
- res->l_lock_num_prmode_failed = 0;
- res->l_lock_total_prmode = 0;
- res->l_lock_max_prmode = 0;
- res->l_lock_num_exmode = 0;
- res->l_lock_num_exmode_failed = 0;
- res->l_lock_total_exmode = 0;
- res->l_lock_max_exmode = 0;
res->l_lock_refresh = 0;
+ memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats));
+ memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats));
}
static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
struct ocfs2_mask_waiter *mw, int ret)
{
- unsigned long long *num, *sum;
- unsigned int *max, *failed;
- struct timespec ts = current_kernel_time();
- unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start;
-
- if (level == LKM_PRMODE) {
- num = &res->l_lock_num_prmode;
- sum = &res->l_lock_total_prmode;
- max = &res->l_lock_max_prmode;
- failed = &res->l_lock_num_prmode_failed;
- } else if (level == LKM_EXMODE) {
- num = &res->l_lock_num_exmode;
- sum = &res->l_lock_total_exmode;
- max = &res->l_lock_max_exmode;
- failed = &res->l_lock_num_exmode_failed;
- } else
+ u32 usec;
+ ktime_t kt;
+ struct ocfs2_lock_stats *stats;
+
+ if (level == LKM_PRMODE)
+ stats = &res->l_lock_prmode;
+ else if (level == LKM_EXMODE)
+ stats = &res->l_lock_exmode;
+ else
return;
- (*num)++;
- (*sum) += time;
- if (time > *max)
- *max = time;
+ kt = ktime_sub(ktime_get(), mw->mw_lock_start);
+ usec = ktime_to_us(kt);
+
+ stats->ls_gets++;
+ stats->ls_total += ktime_to_ns(kt);
+ /* overflow */
+ if (unlikely(stats->ls_gets == 0)) {
+ stats->ls_gets++;
+ stats->ls_total = ktime_to_ns(kt);
+ }
+
+ if (stats->ls_max < usec)
+ stats->ls_max = usec;
+
if (ret)
- (*failed)++;
+ stats->ls_fail++;
}
static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
@@ -482,8 +475,7 @@ static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
{
- struct timespec ts = current_kernel_time();
- mw->mw_lock_start = timespec_to_ns(&ts);
+ mw->mw_lock_start = ktime_get();
}
#else
static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
@@ -729,8 +721,6 @@ void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
{
- mlog_entry_void();
-
if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
return;
@@ -756,14 +746,11 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
memset(&res->l_lksb, 0, sizeof(res->l_lksb));
res->l_flags = 0UL;
- mlog_exit_void();
}
static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
int level)
{
- mlog_entry_void();
-
BUG_ON(!lockres);
switch(level) {
@@ -776,15 +763,11 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
default:
BUG();
}
-
- mlog_exit_void();
}
static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
int level)
{
- mlog_entry_void();
-
BUG_ON(!lockres);
switch(level) {
@@ -799,7 +782,6 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
default:
BUG();
}
- mlog_exit_void();
}
/* WARNING: This function lives in a world where the only three lock
@@ -846,8 +828,6 @@ static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
{
- mlog_entry_void();
-
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
@@ -860,14 +840,10 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res
lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
}
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-
- mlog_exit_void();
}
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
{
- mlog_entry_void();
-
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
@@ -889,14 +865,10 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-
- mlog_exit_void();
}
static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
{
- mlog_entry_void();
-
BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
@@ -908,15 +880,12 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
lockres->l_level = lockres->l_requested;
lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-
- mlog_exit_void();
}
static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
int level)
{
int needs_downconvert = 0;
- mlog_entry_void();
assert_spin_locked(&lockres->l_lock);
@@ -938,8 +907,7 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
if (needs_downconvert)
lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
-
- mlog_exit(needs_downconvert);
+ mlog(0, "needs_downconvert = %d\n", needs_downconvert);
return needs_downconvert;
}
@@ -1151,8 +1119,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
unsigned long flags;
- mlog_entry_void();
-
mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
lockres->l_name, lockres->l_unlock_action);
@@ -1162,7 +1128,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
"unlock_action %d\n", error, lockres->l_name,
lockres->l_unlock_action);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- mlog_exit_void();
return;
}
@@ -1186,8 +1151,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
wake_up(&lockres->l_event);
spin_unlock_irqrestore(&lockres->l_lock, flags);
-
- mlog_exit_void();
}
/*
@@ -1233,7 +1196,6 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
{
unsigned long flags;
- mlog_entry_void();
spin_lock_irqsave(&lockres->l_lock, flags);
lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
@@ -1244,7 +1206,6 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
spin_unlock_irqrestore(&lockres->l_lock, flags);
wake_up(&lockres->l_event);
- mlog_exit_void();
}
/* Note: If we detect another process working on the lock (i.e.,
@@ -1260,8 +1221,6 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
unsigned long flags;
unsigned int gen;
- mlog_entry_void();
-
mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
dlm_flags);
@@ -1293,7 +1252,6 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
bail:
- mlog_exit(ret);
return ret;
}
@@ -1346,7 +1304,7 @@ static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
{
wait_for_completion(&mw->mw_complete);
/* Re-arm the completion in case we want to wait on it again */
- INIT_COMPLETION(mw->mw_complete);
+ reinit_completion(&mw->mw_complete);
return mw->mw_status;
}
@@ -1397,7 +1355,7 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
else
ret = mw->mw_status;
/* Re-arm the completion in case we want to wait on it again */
- INIT_COMPLETION(mw->mw_complete);
+ reinit_completion(&mw->mw_complete);
return ret;
}
@@ -1416,8 +1374,6 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
unsigned int gen;
int noqueue_attempted = 0;
- mlog_entry_void();
-
ocfs2_init_mask_waiter(&mw);
if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
@@ -1583,7 +1539,6 @@ out:
caller_ip);
}
#endif
- mlog_exit(ret);
return ret;
}
@@ -1605,7 +1560,6 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
{
unsigned long flags;
- mlog_entry_void();
spin_lock_irqsave(&lockres->l_lock, flags);
ocfs2_dec_holders(lockres, level);
ocfs2_downconvert_on_unlock(osb, lockres);
@@ -1614,7 +1568,6 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
if (lockres->l_lockdep_map.key != NULL)
rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
#endif
- mlog_exit_void();
}
static int ocfs2_create_new_lock(struct ocfs2_super *osb,
@@ -1648,8 +1601,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
BUG_ON(!inode);
BUG_ON(!ocfs2_inode_is_new(inode));
- mlog_entry_void();
-
mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
/* NOTE: That we don't increment any of the holder counts, nor
@@ -1683,7 +1634,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
}
bail:
- mlog_exit(ret);
return ret;
}
@@ -1695,16 +1645,12 @@ int ocfs2_rw_lock(struct inode *inode, int write)
BUG_ON(!inode);
- mlog_entry_void();
-
mlog(0, "inode %llu take %s RW lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
- if (ocfs2_mount_local(osb)) {
- mlog_exit(0);
+ if (ocfs2_mount_local(osb))
return 0;
- }
lockres = &OCFS2_I(inode)->ip_rw_lockres;
@@ -1715,7 +1661,6 @@ int ocfs2_rw_lock(struct inode *inode, int write)
if (status < 0)
mlog_errno(status);
- mlog_exit(status);
return status;
}
@@ -1725,16 +1670,12 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry_void();
-
mlog(0, "inode %llu drop %s RW lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
if (!ocfs2_mount_local(osb))
ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
-
- mlog_exit_void();
}
/*
@@ -1748,12 +1689,10 @@ int ocfs2_open_lock(struct inode *inode)
BUG_ON(!inode);
- mlog_entry_void();
-
mlog(0, "inode %llu take PRMODE open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- if (ocfs2_mount_local(osb))
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
goto out;
lockres = &OCFS2_I(inode)->ip_open_lockres;
@@ -1764,7 +1703,6 @@ int ocfs2_open_lock(struct inode *inode)
mlog_errno(status);
out:
- mlog_exit(status);
return status;
}
@@ -1776,12 +1714,16 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
BUG_ON(!inode);
- mlog_entry_void();
-
mlog(0, "inode %llu try to take %s open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
write ? "EXMODE" : "PRMODE");
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (write)
+ status = -EROFS;
+ goto out;
+ }
+
if (ocfs2_mount_local(osb))
goto out;
@@ -1799,7 +1741,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
level, DLM_LKF_NOQUEUE, 0);
out:
- mlog_exit(status);
return status;
}
@@ -1811,8 +1752,6 @@ void ocfs2_open_unlock(struct inode *inode)
struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry_void();
-
mlog(0, "inode %llu drop open lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -1827,7 +1766,7 @@ void ocfs2_open_unlock(struct inode *inode)
DLM_LOCK_EX);
out:
- mlog_exit_void();
+ return;
}
static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
@@ -2043,8 +1982,6 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
{
int kick = 0;
- mlog_entry_void();
-
/* If we know that another node is waiting on our lock, kick
* the downconvert thread * pre-emptively when we reach a release
* condition. */
@@ -2065,8 +2002,6 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
if (kick)
ocfs2_wake_downconvert_thread(osb);
-
- mlog_exit_void();
}
#define OCFS2_SEC_BITS 34
@@ -2095,8 +2030,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- mlog_entry_void();
-
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
/*
@@ -2112,8 +2045,8 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_version = OCFS2_LVB_VERSION;
lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
- lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
- lvb->lvb_igid = cpu_to_be32(inode->i_gid);
+ lvb->lvb_iuid = cpu_to_be32(i_uid_read(inode));
+ lvb->lvb_igid = cpu_to_be32(i_gid_read(inode));
lvb->lvb_imode = cpu_to_be16(inode->i_mode);
lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
lvb->lvb_iatime_packed =
@@ -2128,8 +2061,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
out:
mlog_meta_lvb(0, lockres);
-
- mlog_exit_void();
}
static void ocfs2_unpack_timespec(struct timespec *spec,
@@ -2145,8 +2076,6 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- mlog_entry_void();
-
mlog_meta_lvb(0, lockres);
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2166,10 +2095,10 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
else
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
- inode->i_gid = be32_to_cpu(lvb->lvb_igid);
+ i_uid_write(inode, be32_to_cpu(lvb->lvb_iuid));
+ i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
- inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
+ set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
ocfs2_unpack_timespec(&inode->i_atime,
be64_to_cpu(lvb->lvb_iatime_packed));
ocfs2_unpack_timespec(&inode->i_mtime,
@@ -2177,8 +2106,6 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
ocfs2_unpack_timespec(&inode->i_ctime,
be64_to_cpu(lvb->lvb_ictime_packed));
spin_unlock(&oi->ip_lock);
-
- mlog_exit_void();
}
static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
@@ -2205,8 +2132,6 @@ static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
unsigned long flags;
int status = 0;
- mlog_entry_void();
-
refresh_check:
spin_lock_irqsave(&lockres->l_lock, flags);
if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
@@ -2227,7 +2152,7 @@ refresh_check:
status = 1;
bail:
- mlog_exit(status);
+ mlog(0, "status %d\n", status);
return status;
}
@@ -2237,7 +2162,6 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
int status)
{
unsigned long flags;
- mlog_entry_void();
spin_lock_irqsave(&lockres->l_lock, flags);
lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
@@ -2246,8 +2170,6 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
spin_unlock_irqrestore(&lockres->l_lock, flags);
wake_up(&lockres->l_event);
-
- mlog_exit_void();
}
/* may or may not return a bh if it went to disk. */
@@ -2260,8 +2182,6 @@ static int ocfs2_inode_lock_update(struct inode *inode,
struct ocfs2_dinode *fe;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry_void();
-
if (ocfs2_mount_local(osb))
goto bail;
@@ -2330,7 +2250,6 @@ static int ocfs2_inode_lock_update(struct inode *inode,
bail_refresh:
ocfs2_complete_lock_res_refresh(lockres, status);
bail:
- mlog_exit(status);
return status;
}
@@ -2374,8 +2293,6 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
BUG_ON(!inode);
- mlog_entry_void();
-
mlog(0, "inode %llu, take %s META lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ex ? "EXMODE" : "PRMODE");
@@ -2387,7 +2304,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
if (ocfs2_is_hard_readonly(osb)) {
if (ex)
status = -EROFS;
- goto bail;
+ goto getbh;
}
if (ocfs2_mount_local(osb))
@@ -2405,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
arg_flags, subclass, _RET_IP_);
if (status < 0) {
- if (status != -EAGAIN && status != -EIOCBRETRY)
+ if (status != -EAGAIN)
mlog_errno(status);
goto bail;
}
@@ -2445,7 +2362,7 @@ local:
mlog_errno(status);
goto bail;
}
-
+getbh:
if (ret_bh) {
status = ocfs2_assign_bh(inode, ret_bh, local_bh);
if (status < 0) {
@@ -2467,7 +2384,6 @@ bail:
if (local_bh)
brelse(local_bh);
- mlog_exit(status);
return status;
}
@@ -2517,7 +2433,6 @@ int ocfs2_inode_lock_atime(struct inode *inode,
{
int ret;
- mlog_entry_void();
ret = ocfs2_inode_lock(inode, NULL, 0);
if (ret < 0) {
mlog_errno(ret);
@@ -2545,7 +2460,6 @@ int ocfs2_inode_lock_atime(struct inode *inode,
} else
*level = 0;
- mlog_exit(ret);
return ret;
}
@@ -2556,8 +2470,6 @@ void ocfs2_inode_unlock(struct inode *inode,
struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry_void();
-
mlog(0, "inode %llu drop %s META lock\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
ex ? "EXMODE" : "PRMODE");
@@ -2565,8 +2477,6 @@ void ocfs2_inode_unlock(struct inode *inode,
if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
!ocfs2_mount_local(osb))
ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
-
- mlog_exit_void();
}
int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
@@ -2617,8 +2527,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
- mlog_entry_void();
-
if (ocfs2_is_hard_readonly(osb))
return -EROFS;
@@ -2636,21 +2544,18 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
* refreshed, so we do it here. Of course, making sense of
* everything is up to the caller :) */
status = ocfs2_should_refresh_lock_res(lockres);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
if (status) {
status = ocfs2_refresh_slot_info(osb);
ocfs2_complete_lock_res_refresh(lockres, status);
- if (status < 0)
+ if (status < 0) {
+ ocfs2_cluster_unlock(osb, lockres, level);
mlog_errno(status);
+ }
ocfs2_track_lock_refresh(lockres);
}
bail:
- mlog_exit(status);
return status;
}
@@ -2727,8 +2632,11 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
BUG_ON(!dl);
- if (ocfs2_is_hard_readonly(osb))
- return -EROFS;
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (ex)
+ return -EROFS;
+ return 0;
+ }
if (ocfs2_mount_local(osb))
return 0;
@@ -2746,7 +2654,7 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- if (!ocfs2_mount_local(osb))
+ if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
}
@@ -2869,8 +2777,15 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
return iter;
}
-/* So that debugfs.ocfs2 can determine which format is being used */
-#define OCFS2_DLM_DEBUG_STR_VERSION 2
+/*
+ * Version is used by debugfs.ocfs2 to determine the format being used
+ *
+ * New in version 2
+ * - Lock stats printed
+ * New in version 3
+ * - Max time in lock stats is in usecs (instead of nsecs)
+ */
+#define OCFS2_DLM_DEBUG_STR_VERSION 3
static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
{
int i;
@@ -2912,18 +2827,18 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
seq_printf(m, "0x%x\t", lvb[i]);
#ifdef CONFIG_OCFS2_FS_STATS
-# define lock_num_prmode(_l) (_l)->l_lock_num_prmode
-# define lock_num_exmode(_l) (_l)->l_lock_num_exmode
-# define lock_num_prmode_failed(_l) (_l)->l_lock_num_prmode_failed
-# define lock_num_exmode_failed(_l) (_l)->l_lock_num_exmode_failed
-# define lock_total_prmode(_l) (_l)->l_lock_total_prmode
-# define lock_total_exmode(_l) (_l)->l_lock_total_exmode
-# define lock_max_prmode(_l) (_l)->l_lock_max_prmode
-# define lock_max_exmode(_l) (_l)->l_lock_max_exmode
-# define lock_refresh(_l) (_l)->l_lock_refresh
+# define lock_num_prmode(_l) ((_l)->l_lock_prmode.ls_gets)
+# define lock_num_exmode(_l) ((_l)->l_lock_exmode.ls_gets)
+# define lock_num_prmode_failed(_l) ((_l)->l_lock_prmode.ls_fail)
+# define lock_num_exmode_failed(_l) ((_l)->l_lock_exmode.ls_fail)
+# define lock_total_prmode(_l) ((_l)->l_lock_prmode.ls_total)
+# define lock_total_exmode(_l) ((_l)->l_lock_exmode.ls_total)
+# define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max)
+# define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max)
+# define lock_refresh(_l) ((_l)->l_lock_refresh)
#else
-# define lock_num_prmode(_l) (0ULL)
-# define lock_num_exmode(_l) (0ULL)
+# define lock_num_prmode(_l) (0)
+# define lock_num_exmode(_l) (0)
# define lock_num_prmode_failed(_l) (0)
# define lock_num_exmode_failed(_l) (0)
# define lock_total_prmode(_l) (0ULL)
@@ -2933,8 +2848,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
# define lock_refresh(_l) (0)
#endif
/* The following seq_print was added in version 2 of this output */
- seq_printf(m, "%llu\t"
- "%llu\t"
+ seq_printf(m, "%u\t"
+ "%u\t"
"%u\t"
"%u\t"
"%llu\t"
@@ -2966,7 +2881,7 @@ static const struct seq_operations ocfs2_dlm_seq_ops = {
static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
{
- struct seq_file *seq = (struct seq_file *) file->private_data;
+ struct seq_file *seq = file->private_data;
struct ocfs2_dlm_seq_priv *priv = seq->private;
struct ocfs2_lock_res *res = &priv->p_iter_res;
@@ -3000,7 +2915,7 @@ static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
goto out;
}
- seq = (struct seq_file *) file->private_data;
+ seq = file->private_data;
seq->private = priv;
ocfs2_add_lockres_tracking(&priv->p_iter_res,
@@ -3054,8 +2969,6 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
int status = 0;
struct ocfs2_cluster_connection *conn = NULL;
- mlog_entry_void();
-
if (ocfs2_mount_local(osb)) {
osb->node_num = 0;
goto local;
@@ -3078,6 +2991,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
/* for now, uuid == domain */
status = ocfs2_cluster_connect(osb->osb_cluster_stack,
+ osb->osb_cluster_name,
+ strlen(osb->osb_cluster_name),
osb->uuid_str,
strlen(osb->uuid_str),
&lproto, ocfs2_do_node_down, osb,
@@ -3087,7 +3002,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
goto bail;
}
- status = ocfs2_cluster_this_node(&osb->node_num);
+ status = ocfs2_cluster_this_node(conn, &osb->node_num);
if (status < 0) {
mlog_errno(status);
mlog(ML_ERROR,
@@ -3112,15 +3027,12 @@ bail:
kthread_stop(osb->dc_task);
}
- mlog_exit(status);
return status;
}
void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
int hangup_pending)
{
- mlog_entry_void();
-
ocfs2_drop_osb_locks(osb);
/*
@@ -3143,8 +3055,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
osb->cconn = NULL;
ocfs2_dlm_shutdown_debug(osb);
-
- mlog_exit_void();
}
static int ocfs2_drop_lock(struct ocfs2_super *osb,
@@ -3226,26 +3136,63 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
ocfs2_wait_on_busy_lock(lockres);
out:
- mlog_exit(0);
return 0;
}
+static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
+
/* Mark the lockres as being dropped. It will no longer be
* queued if blocking, but we still may have to wait on it
* being dequeued from the downconvert thread before we can consider
* it safe to drop.
*
* You can *not* attempt to call cluster_lock on this lockres anymore. */
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres)
{
int status;
struct ocfs2_mask_waiter mw;
- unsigned long flags;
+ unsigned long flags, flags2;
ocfs2_init_mask_waiter(&mw);
spin_lock_irqsave(&lockres->l_lock, flags);
lockres->l_flags |= OCFS2_LOCK_FREEING;
+ if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
+ /*
+ * We know the downconvert is queued but not in progress
+ * because we are the downconvert thread and processing
+ * different lock. So we can just remove the lock from the
+ * queue. This is not only an optimization but also a way
+ * to avoid the following deadlock:
+ * ocfs2_dentry_post_unlock()
+ * ocfs2_dentry_lock_put()
+ * ocfs2_drop_dentry_lock()
+ * iput()
+ * ocfs2_evict_inode()
+ * ocfs2_clear_inode()
+ * ocfs2_mark_lockres_freeing()
+ * ... blocks waiting for OCFS2_LOCK_QUEUED
+ * since we are the downconvert thread which
+ * should clear the flag.
+ */
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ spin_lock_irqsave(&osb->dc_task_lock, flags2);
+ list_del_init(&lockres->l_blocked_list);
+ osb->blocked_lock_count--;
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags2);
+ /*
+ * Warn if we recurse into another post_unlock call. Strictly
+ * speaking it isn't a problem but we need to be careful if
+ * that happens (stack overflow, deadlocks, ...) so warn if
+ * ocfs2 grows a path for which this can happen.
+ */
+ WARN_ON_ONCE(lockres->l_ops->post_unlock);
+ /* Since the lock is freeing we don't do much in the fn below */
+ ocfs2_process_blocked_lock(osb, lockres);
+ return;
+ }
while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3266,7 +3213,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
{
int ret;
- ocfs2_mark_lockres_freeing(lockres);
+ ocfs2_mark_lockres_freeing(osb, lockres);
ret = ocfs2_drop_lock(osb, lockres);
if (ret)
mlog_errno(ret);
@@ -3284,8 +3231,6 @@ int ocfs2_drop_inode_locks(struct inode *inode)
{
int status, err;
- mlog_entry_void();
-
/* No need to call ocfs2_mark_lockres_freeing here -
* ocfs2_clear_inode has done it for us. */
@@ -3310,7 +3255,6 @@ int ocfs2_drop_inode_locks(struct inode *inode)
if (err < 0 && !status)
status = err;
- mlog_exit(status);
return status;
}
@@ -3352,8 +3296,6 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
int ret;
u32 dlm_flags = DLM_LKF_CONVERT;
- mlog_entry_void();
-
mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
lockres->l_level, new_level);
@@ -3375,7 +3317,6 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
ret = 0;
bail:
- mlog_exit(ret);
return ret;
}
@@ -3385,8 +3326,6 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
{
assert_spin_locked(&lockres->l_lock);
- mlog_entry_void();
-
if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
/* If we're already trying to cancel a lock conversion
* then just drop the spinlock and allow the caller to
@@ -3416,8 +3355,6 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
{
int ret;
- mlog_entry_void();
-
ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
DLM_LKF_CANCEL);
if (ret) {
@@ -3427,7 +3364,6 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
- mlog_exit(ret);
return ret;
}
@@ -3443,8 +3379,6 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
int set_lvb = 0;
unsigned int gen;
- mlog_entry_void();
-
spin_lock_irqsave(&lockres->l_lock, flags);
recheck:
@@ -3619,14 +3553,14 @@ downconvert:
gen);
leave:
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
leave_requeue:
spin_unlock_irqrestore(&lockres->l_lock, flags);
ctl->requeue = 1;
- mlog_exit(0);
return 0;
}
@@ -3635,10 +3569,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
{
struct inode *inode;
struct address_space *mapping;
+ struct ocfs2_inode_info *oi;
inode = ocfs2_lock_res_inode(lockres);
mapping = inode->i_mapping;
+ if (S_ISDIR(inode->i_mode)) {
+ oi = OCFS2_I(inode);
+ oi->ip_dir_lock_gen++;
+ mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
+ goto out;
+ }
+
if (!S_ISREG(inode->i_mode))
goto out;
@@ -3851,8 +3793,6 @@ static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
oinfo->dqi_gi.dqi_type);
- mlog_entry_void();
-
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
@@ -3861,8 +3801,6 @@ static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
-
- mlog_exit_void();
}
void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
@@ -3871,10 +3809,8 @@ void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
- mlog_entry_void();
if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
ocfs2_cluster_unlock(osb, lockres, level);
- mlog_exit_void();
}
static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
@@ -3897,7 +3833,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
oinfo->dqi_gi.dqi_free_entry =
be32_to_cpu(lvb->lvb_free_entry);
} else {
- status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
+ status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
+ oinfo->dqi_giblk, &bh);
if (status) {
mlog_errno(status);
goto bail;
@@ -3928,8 +3865,6 @@ int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
int status = 0;
- mlog_entry_void();
-
/* On RO devices, locking really isn't needed... */
if (ocfs2_is_hard_readonly(osb)) {
if (ex)
@@ -3952,7 +3887,6 @@ int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
ocfs2_qinfo_unlock(oinfo, ex);
ocfs2_complete_lock_res_refresh(lockres, status);
bail:
- mlog_exit(status);
return status;
}
@@ -3998,8 +3932,6 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
* considered valid until we remove the OCFS2_LOCK_QUEUED
* flag. */
- mlog_entry_void();
-
BUG_ON(!lockres);
BUG_ON(!lockres->l_ops);
@@ -4033,14 +3965,12 @@ unqueue:
if (ctl.unblock_action != UNBLOCK_CONTINUE
&& lockres->l_ops->post_unlock)
lockres->l_ops->post_unlock(osb, lockres);
-
- mlog_exit_void();
}
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres)
{
- mlog_entry_void();
+ unsigned long flags;
assert_spin_locked(&lockres->l_lock);
@@ -4055,25 +3985,22 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
if (list_empty(&lockres->l_blocked_list)) {
list_add_tail(&lockres->l_blocked_list,
&osb->blocked_lock_list);
osb->blocked_lock_count++;
}
- spin_unlock(&osb->dc_task_lock);
-
- mlog_exit_void();
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
}
static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
{
unsigned long processed;
+ unsigned long flags;
struct ocfs2_lock_res *lockres;
- mlog_entry_void();
-
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
/* grab this early so we know to try again if a state change and
* wake happens part-way through our work */
osb->dc_work_sequence = osb->dc_wake_sequence;
@@ -4086,40 +4013,40 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
struct ocfs2_lock_res, l_blocked_list);
list_del_init(&lockres->l_blocked_list);
osb->blocked_lock_count--;
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
BUG_ON(!processed);
processed--;
ocfs2_process_blocked_lock(osb, lockres);
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
}
- spin_unlock(&osb->dc_task_lock);
-
- mlog_exit_void();
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
}
static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
{
int empty = 0;
+ unsigned long flags;
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
if (list_empty(&osb->blocked_lock_list))
empty = 1;
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
return empty;
}
static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
{
int should_wake = 0;
+ unsigned long flags;
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
if (osb->dc_work_sequence != osb->dc_wake_sequence)
should_wake = 1;
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
return should_wake;
}
@@ -4149,10 +4076,12 @@ static int ocfs2_downconvert_thread(void *arg)
void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
{
- spin_lock(&osb->dc_task_lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
/* make sure the voting thread gets a swipe at whatever changes
* the caller may have made to the voting state */
osb->dc_wake_sequence++;
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
wake_up(&osb->dc_event);
}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index d1ce48e1b3d..d293a22c32c 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -84,6 +84,7 @@ enum {
OI_LS_PARENT,
OI_LS_RENAME1,
OI_LS_RENAME2,
+ OI_LS_REFLINK_TARGET,
};
int ocfs2_dlm_init(struct ocfs2_super *osb);
@@ -156,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 19ad145d2af..29651167190 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -26,7 +26,6 @@
#include <linux/fs.h>
#include <linux/types.h>
-#define MLOG_MASK_PREFIX ML_EXPORT
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -40,6 +39,7 @@
#include "buffer_head_io.h"
#include "suballoc.h"
+#include "ocfs2_trace.h"
struct ocfs2_inode_handle
{
@@ -56,10 +56,9 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
int status, set;
struct dentry *result;
- mlog_entry("(0x%p, 0x%p)\n", sb, handle);
+ trace_ocfs2_get_dentry_begin(sb, handle, (unsigned long long)blkno);
if (blkno == 0) {
- mlog(0, "nfs wants inode with blkno: 0\n");
result = ERR_PTR(-ESTALE);
goto bail;
}
@@ -83,6 +82,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
}
status = ocfs2_test_inode_bit(osb, blkno, &set);
+ trace_ocfs2_get_dentry_test_bit(status, set);
if (status < 0) {
if (status == -EINVAL) {
/*
@@ -90,18 +90,14 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
* as an inode, we return -ESTALE to be
* nice
*/
- mlog(0, "test inode bit failed %d\n", status);
status = -ESTALE;
- } else {
+ } else
mlog(ML_ERROR, "test inode bit failed %d\n", status);
- }
goto unlock_nfs_sync;
}
/* If the inode allocator bit is clear, this inode must be stale */
if (!set) {
- mlog(0, "inode %llu suballoc bit is clear\n",
- (unsigned long long)blkno);
status = -ESTALE;
goto unlock_nfs_sync;
}
@@ -114,8 +110,8 @@ unlock_nfs_sync:
check_err:
if (status < 0) {
if (status == -ESTALE) {
- mlog(0, "stale inode ino: %llu generation: %u\n",
- (unsigned long long)blkno, handle->ih_generation);
+ trace_ocfs2_get_dentry_stale((unsigned long long)blkno,
+ handle->ih_generation);
}
result = ERR_PTR(status);
goto bail;
@@ -130,20 +126,19 @@ check_err:
check_gen:
if (handle->ih_generation != inode->i_generation) {
iput(inode);
- mlog(0, "stale inode ino: %llu generation: %u\n",
- (unsigned long long)blkno, handle->ih_generation);
+ trace_ocfs2_get_dentry_generation((unsigned long long)blkno,
+ handle->ih_generation,
+ inode->i_generation);
result = ERR_PTR(-ESTALE);
goto bail;
}
result = d_obtain_alias(inode);
- if (!IS_ERR(result))
- result->d_op = &ocfs2_dentry_ops;
- else
+ if (IS_ERR(result))
mlog_errno(PTR_ERR(result));
bail:
- mlog_exit_ptr(result);
+ trace_ocfs2_get_dentry_end(result);
return result;
}
@@ -154,11 +149,8 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
struct dentry *parent;
struct inode *dir = child->d_inode;
- mlog_entry("(0x%p, '%.*s')\n", child,
- child->d_name.len, child->d_name.name);
-
- mlog(0, "find parent of directory %llu\n",
- (unsigned long long)OCFS2_I(dir)->ip_blkno);
+ trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno);
status = ocfs2_inode_lock(dir, NULL, 0);
if (status < 0) {
@@ -175,55 +167,53 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
}
parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
- if (!IS_ERR(parent))
- parent->d_op = &ocfs2_dentry_ops;
bail_unlock:
ocfs2_inode_unlock(dir, 0);
bail:
- mlog_exit_ptr(parent);
+ trace_ocfs2_get_parent_end(parent);
return parent;
}
-static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
- int connectable)
+static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len,
+ struct inode *parent)
{
- struct inode *inode = dentry->d_inode;
int len = *max_len;
int type = 1;
u64 blkno;
u32 generation;
__le32 *fh = (__force __le32 *) fh_in;
- mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
- dentry->d_name.len, dentry->d_name.name,
- fh, len, connectable);
+#ifdef TRACE_HOOKS_ARE_NOT_BRAINDEAD_IN_YOUR_OPINION
+#error "You go ahead and fix that mess, then. Somehow"
+ trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len,
+ dentry->d_name.name,
+ fh, len, connectable);
+#endif
- if (len < 3 || (connectable && len < 6)) {
- mlog(ML_ERROR, "fh buffer is too small for encoding\n");
- type = 255;
+ if (parent && (len < 6)) {
+ *max_len = 6;
+ type = FILEID_INVALID;
+ goto bail;
+ } else if (len < 3) {
+ *max_len = 3;
+ type = FILEID_INVALID;
goto bail;
}
blkno = OCFS2_I(inode)->ip_blkno;
generation = inode->i_generation;
- mlog(0, "Encoding fh: blkno: %llu, generation: %u\n",
- (unsigned long long)blkno, generation);
+ trace_ocfs2_encode_fh_self((unsigned long long)blkno, generation);
len = 3;
fh[0] = cpu_to_le32((u32)(blkno >> 32));
fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
fh[2] = cpu_to_le32(generation);
- if (connectable && !S_ISDIR(inode->i_mode)) {
- struct inode *parent;
-
- spin_lock(&dentry->d_lock);
-
- parent = dentry->d_parent->d_inode;
+ if (parent) {
blkno = OCFS2_I(parent)->ip_blkno;
generation = parent->i_generation;
@@ -231,19 +221,17 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
fh[5] = cpu_to_le32(generation);
- spin_unlock(&dentry->d_lock);
-
len = 6;
type = 2;
- mlog(0, "Encoding parent: blkno: %llu, generation: %u\n",
- (unsigned long long)blkno, generation);
+ trace_ocfs2_encode_fh_parent((unsigned long long)blkno,
+ generation);
}
*max_len = len;
bail:
- mlog_exit(type);
+ trace_ocfs2_encode_fh_type(type);
return type;
}
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c562a7581cf..767370b656c 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,10 +24,10 @@
#include <linux/fs.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <linux/types.h>
#include <linux/fiemap.h>
-#define MLOG_MASK_PREFIX ML_EXTENT_MAP
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -38,6 +38,7 @@
#include "inode.h"
#include "super.h"
#include "symlink.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -281,8 +282,7 @@ search:
spin_unlock(&oi->ip_lock);
out:
- if (new_emi)
- kfree(new_emi);
+ kfree(new_emi);
}
static int ocfs2_last_eb_is_empty(struct inode *inode,
@@ -781,7 +781,6 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
cpos = map_start >> osb->s_clustersize_bits;
mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
map_start + map_len);
- mapping_end -= cpos;
is_last = 0;
while (cpos < mapping_end && !is_last) {
u32 fe_flags;
@@ -790,7 +789,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
&hole_size, &rec, &is_last);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_unlock;
}
if (rec.e_blkno == 0ULL) {
@@ -831,6 +830,100 @@ out:
return ret;
}
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret;
+ unsigned int is_last = 0, is_data = 0;
+ u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 cpos, cend, clen, hole_size;
+ u64 extoff, extlen;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_extent_rec rec;
+
+ BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE);
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ if (*offset >= i_size_read(inode)) {
+ ret = -ENXIO;
+ goto out_unlock;
+ }
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ if (whence == SEEK_HOLE)
+ *offset = i_size_read(inode);
+ goto out_unlock;
+ }
+
+ clen = 0;
+ cpos = *offset >> cs_bits;
+ cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+
+ while (cpos < cend && !is_last) {
+ ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
+ &rec, &is_last);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ extoff = cpos;
+ extoff <<= cs_bits;
+
+ if (rec.e_blkno == 0ULL) {
+ clen = hole_size;
+ is_data = 0;
+ } else {
+ clen = le16_to_cpu(rec.e_leaf_clusters) -
+ (cpos - le32_to_cpu(rec.e_cpos));
+ is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
+ }
+
+ if ((!is_data && whence == SEEK_HOLE) ||
+ (is_data && whence == SEEK_DATA)) {
+ if (extoff > *offset)
+ *offset = extoff;
+ goto out_unlock;
+ }
+
+ if (!is_last)
+ cpos += clen;
+ }
+
+ if (whence == SEEK_HOLE) {
+ extoff = cpos;
+ extoff <<= cs_bits;
+ extlen = clen;
+ extlen <<= cs_bits;
+
+ if ((extoff + extlen) > i_size_read(inode))
+ extlen = i_size_read(inode) - extoff;
+ extoff += extlen;
+ if (extoff > *offset)
+ *offset = extoff;
+ goto out_unlock;
+ }
+
+ ret = -ENXIO;
+
+out_unlock:
+
+ brelse(di_bh);
+
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ocfs2_inode_unlock(inode, 0);
+out:
+ return ret;
+}
+
int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
struct buffer_head *bhs[], int flags,
int (*validate)(struct super_block *sb,
@@ -840,10 +933,9 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
u64 p_block, p_count;
int i, count, done = 0;
- mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
- "flags = %x, validate = %p)\n",
- inode, (unsigned long long)v_block, nr, bhs, flags,
- validate);
+ trace_ocfs2_read_virt_blocks(
+ inode, (unsigned long long)v_block, nr, bhs, flags,
+ validate);
if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
i_size_read(inode)) {
@@ -896,7 +988,6 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
}
out:
- mlog_exit(rc);
return rc;
}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index e79d41c2c90..67ea57d2fd5 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,6 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 map_start, u64 map_len);
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
+
int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
struct ocfs2_extent_list *el,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 17947dc8341..2930e231f3f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -36,8 +36,8 @@
#include <linux/writeback.h>
#include <linux/falloc.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -60,15 +60,10 @@
#include "acl.h"
#include "quota.h"
#include "refcounttree.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
-static int ocfs2_sync_inode(struct inode *inode)
-{
- filemap_fdatawrite(inode->i_mapping);
- return sync_mapping_buffers(inode->i_mapping);
-}
-
static int ocfs2_init_file_private(struct inode *inode, struct file *file)
{
struct ocfs2_file_private *fp;
@@ -104,8 +99,10 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
int mode = file->f_flags;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
- file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
+ trace_ocfs2_file_open(inode, file, file->f_path.dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name, mode);
if (file->f_mode & FMODE_WRITE)
dquot_initialize(inode);
@@ -140,7 +137,6 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
}
leave:
- mlog_exit(status);
return status;
}
@@ -148,19 +144,19 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
- file->f_path.dentry->d_name.len,
- file->f_path.dentry->d_name.name);
-
spin_lock(&oi->ip_lock);
if (!--oi->ip_open_count)
oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
+
+ trace_ocfs2_file_release(inode, file, file->f_path.dentry,
+ oi->ip_blkno,
+ file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name,
+ oi->ip_open_count);
spin_unlock(&oi->ip_lock);
ocfs2_free_file_private(inode, file);
- mlog_exit(0);
-
return 0;
}
@@ -175,30 +171,44 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
return 0;
}
-static int ocfs2_sync_file(struct file *file,
- struct dentry *dentry,
+static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
int datasync)
{
int err = 0;
- journal_t *journal;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ journal_t *journal = osb->journal->j_journal;
+ int ret;
+ tid_t commit_tid;
+ bool needs_barrier = false;
- mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
+ OCFS2_I(inode)->ip_blkno,
+ file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name,
+ (unsigned long long)datasync);
- err = ocfs2_sync_inode(dentry->d_inode);
- if (err)
- goto bail;
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- goto bail;
+ err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (err)
+ return err;
- journal = osb->journal->j_journal;
- err = jbd2_journal_force_commit(journal);
+ commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
+ if (journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ needs_barrier = true;
+ err = jbd2_complete_transaction(journal, commit_tid);
+ if (needs_barrier) {
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (!err)
+ err = ret;
+ }
-bail:
- mlog_exit(err);
+ if (err)
+ mlog_errno(err);
return (err < 0) ? -EIO : 0;
}
@@ -254,8 +264,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
handle_t *handle;
struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
- mlog_entry_void();
-
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
@@ -278,15 +286,12 @@ int ocfs2_update_inode_atime(struct inode *inode,
inode->i_atime = CURRENT_TIME;
di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
-
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
+ ocfs2_journal_dirty(handle, bh);
out_commit:
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
- mlog_exit(ret);
return ret;
}
@@ -297,7 +302,6 @@ static int ocfs2_set_inode_size(handle_t *handle,
{
int status;
- mlog_entry_void();
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -309,7 +313,6 @@ static int ocfs2_set_inode_size(handle_t *handle,
}
bail:
- mlog_exit(status);
return status;
}
@@ -333,6 +336,7 @@ int ocfs2_simple_size_update(struct inode *inode,
if (ret < 0)
mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_commit_trans(osb, handle);
out:
return ret;
@@ -381,8 +385,6 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
struct ocfs2_dinode *di;
u64 cluster_bytes;
- mlog_entry_void();
-
/*
* We need to CoW the cluster contains the offset if it is reflinked
* since we will call ocfs2_zero_range_for_truncate later which will
@@ -429,16 +431,13 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
di->i_size = cpu_to_le64(new_i_size);
di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, fe_bh);
out_commit:
ocfs2_commit_trans(osb, handle);
out:
-
- mlog_exit(status);
return status;
}
@@ -449,16 +448,15 @@ static int ocfs2_truncate_file(struct inode *inode,
int status = 0;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_truncate_context *tc = NULL;
-
- mlog_entry("(inode = %llu, new_i_size = %llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)new_i_size);
/* We trust di_bh because it comes from ocfs2_inode_lock(), which
* already validated it */
fe = (struct ocfs2_dinode *) di_bh->b_data;
+ trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)le64_to_cpu(fe->i_size),
+ (unsigned long long)new_i_size);
+
mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
"Inode %llu, inode i_size = %lld != di "
"i_size = %llu, i_flags = 0x%x\n",
@@ -468,26 +466,19 @@ static int ocfs2_truncate_file(struct inode *inode,
le32_to_cpu(fe->i_flags));
if (new_i_size > le64_to_cpu(fe->i_size)) {
- mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
- (unsigned long long)le64_to_cpu(fe->i_size),
- (unsigned long long)new_i_size);
+ trace_ocfs2_truncate_file_error(
+ (unsigned long long)le64_to_cpu(fe->i_size),
+ (unsigned long long)new_i_size);
status = -EINVAL;
mlog_errno(status);
goto bail;
}
- mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- (unsigned long long)le64_to_cpu(fe->i_size),
- (unsigned long long)new_i_size);
-
- /* lets handle the simple truncate cases before doing any more
- * cluster locking. */
- if (new_i_size == le64_to_cpu(fe->i_size))
- goto bail;
-
down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ ocfs2_resv_discard(&osb->osb_la_resmap,
+ &OCFS2_I(inode)->ip_la_data_resv);
+
/*
* The inode lock forced other nodes to sync and drop their
* pages, which (correctly) happens even if we have a truncate
@@ -517,13 +508,7 @@ static int ocfs2_truncate_file(struct inode *inode,
goto bail_unlock_sem;
}
- status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
- if (status < 0) {
- mlog_errno(status);
- goto bail_unlock_sem;
- }
-
- status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
+ status = ocfs2_commit_truncate(osb, inode, di_bh);
if (status < 0) {
mlog_errno(status);
goto bail_unlock_sem;
@@ -537,7 +522,6 @@ bail:
if (!status && OCFS2_I(inode)->ip_clusters == 0)
status = ocfs2_try_remove_refcount_tree(inode, di_bh);
- mlog_exit(status);
return status;
}
@@ -590,10 +574,8 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
struct ocfs2_extent_tree et;
int did_quota = 0;
- mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
-
/*
- * This function only exists for file systems which don't
+ * Unwritten extent only exists for file systems which
* support holes.
*/
BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
@@ -608,11 +590,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
restart_all:
BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
- mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
- "clusters_to_add = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
- clusters_to_add);
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
&data_ac, &meta_ac);
@@ -621,8 +598,7 @@ restart_all:
goto leave;
}
- credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
- clusters_to_add);
+ credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
@@ -632,6 +608,12 @@ restart_all:
}
restarted_transaction:
+ trace_ocfs2_extend_allocation(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)i_size_read(inode),
+ le32_to_cpu(fe->i_clusters), clusters_to_add,
+ why, restart_func);
+
status = dquot_alloc_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
if (status)
@@ -665,12 +647,8 @@ restarted_transaction:
mlog_errno(status);
goto leave;
}
-
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ ocfs2_journal_dirty(handle, bh);
spin_lock(&OCFS2_I(inode)->ip_lock);
clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
@@ -682,17 +660,12 @@ restarted_transaction:
if (why != RESTART_NONE && clusters_to_add) {
if (why == RESTART_META) {
- mlog(0, "restarting function.\n");
restart_func = 1;
+ status = 0;
} else {
BUG_ON(why != RESTART_TRANS);
- mlog(0, "restarting transaction.\n");
- /* TODO: This can be more intelligent. */
- credits = ocfs2_calc_extend_credits(osb->sb,
- &fe->id2.i_list,
- clusters_to_add);
- status = ocfs2_extend_trans(handle, credits);
+ status = ocfs2_allocate_extend_trans(handle, 1);
if (status < 0) {
/* handle still has to be committed at
* this point. */
@@ -704,11 +677,11 @@ restarted_transaction:
}
}
- mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
+ trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
le32_to_cpu(fe->i_clusters),
- (unsigned long long)le64_to_cpu(fe->i_size));
- mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
- OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
+ (unsigned long long)le64_to_cpu(fe->i_size),
+ OCFS2_I(inode)->ip_clusters,
+ (unsigned long long)i_size_read(inode));
leave:
if (status < 0 && did_quota)
@@ -733,65 +706,142 @@ leave:
brelse(bh);
bh = NULL;
- mlog_exit(status);
return status;
}
+/*
+ * While a write will already be ordering the data, a truncate will not.
+ * Thus, we need to explicitly order the zeroed pages.
+ */
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ handle_t *handle = NULL;
+ int ret = 0;
+
+ if (!ocfs2_should_order_data(inode))
+ goto out;
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_jbd2_file_inode(handle, inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret)
+ mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+
+out:
+ if (ret) {
+ if (!IS_ERR(handle))
+ ocfs2_commit_trans(osb, handle);
+ handle = ERR_PTR(ret);
+ }
+ return handle;
+}
+
/* Some parts of this taken from generic_cont_expand, which turned out
* to be too fragile to do exactly what we need without us having to
* worry about recursive locking in ->write_begin() and ->write_end(). */
-static int ocfs2_write_zero_page(struct inode *inode,
- u64 size)
+static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
+ u64 abs_to, struct buffer_head *di_bh)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long index;
- unsigned int offset;
+ unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
handle_t *handle = NULL;
- int ret;
+ int ret = 0;
+ unsigned zero_from, zero_to, block_start, block_end;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
- offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
- /* ugh. in prepare/commit_write, if from==to==start of block, we
- ** skip the prepare. make sure we never send an offset for the start
- ** of a block
- */
- if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
- offset++;
- }
- index = size >> PAGE_CACHE_SHIFT;
+ BUG_ON(abs_from >= abs_to);
+ BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
+ BUG_ON(abs_from & (inode->i_blkbits - 1));
- page = grab_cache_page(mapping, index);
+ page = find_or_create_page(mapping, index, GFP_NOFS);
if (!page) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
- ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_unlock;
- }
+ /* Get the offsets within the page that we want to zero */
+ zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
+ zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+ if (!zero_to)
+ zero_to = PAGE_CACHE_SIZE;
- if (ocfs2_should_order_data(inode)) {
- handle = ocfs2_start_walk_page_trans(inode, page, offset,
- offset);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
+ trace_ocfs2_write_zero_page(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)abs_from,
+ (unsigned long long)abs_to,
+ index, zero_from, zero_to);
+
+ /* We know that zero_from is block aligned */
+ for (block_start = zero_from; block_start < zero_to;
+ block_start = block_end) {
+ block_end = block_start + (1 << inode->i_blkbits);
+
+ /*
+ * block_start is block-aligned. Bump it by one to force
+ * __block_write_begin and block_commit_write to zero the
+ * whole block.
+ */
+ ret = __block_write_begin(page, block_start + 1, 0,
+ ocfs2_get_block);
+ if (ret < 0) {
+ mlog_errno(ret);
goto out_unlock;
}
- }
- /* must not update i_size! */
- ret = block_commit_write(page, offset, offset);
- if (ret < 0)
- mlog_errno(ret);
- else
- ret = 0;
+ if (!handle) {
+ handle = ocfs2_zero_start_ordered_transaction(inode,
+ di_bh);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ break;
+ }
+ }
- if (handle)
+ /* must not update i_size! */
+ ret = block_commit_write(page, block_start + 1,
+ block_start + 1);
+ if (ret < 0)
+ mlog_errno(ret);
+ else
+ ret = 0;
+ }
+
+ if (handle) {
+ /*
+ * fs-writeback will release the dirty pages without page lock
+ * whose offset are over inode size, the release happens at
+ * block_write_full_page().
+ */
+ i_size_write(inode, abs_to);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ di->i_mtime_nsec = di->i_ctime_nsec;
+ ocfs2_journal_dirty(handle, di_bh);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+ }
+
out_unlock:
unlock_page(page);
page_cache_release(page);
@@ -799,22 +849,115 @@ out:
return ret;
}
-static int ocfs2_zero_extend(struct inode *inode,
- u64 zero_to_size)
+/*
+ * Find the next range to zero. We do this in terms of bytes because
+ * that's what ocfs2_zero_extend() wants, and it is dealing with the
+ * pagecache. We may return multiple extents.
+ *
+ * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
+ * needs to be zeroed. range_start and range_end return the next zeroing
+ * range. A subsequent call should pass the previous range_end as its
+ * zero_start. If range_end is 0, there's nothing to do.
+ *
+ * Unwritten extents are skipped over. Refcounted extents are CoWd.
+ */
+static int ocfs2_zero_extend_get_range(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 zero_start, u64 zero_end,
+ u64 *range_start, u64 *range_end)
{
- int ret = 0;
- u64 start_off;
- struct super_block *sb = inode->i_sb;
+ int rc = 0, needs_cow = 0;
+ u32 p_cpos, zero_clusters = 0;
+ u32 zero_cpos =
+ zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
- start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
- while (start_off < zero_to_size) {
- ret = ocfs2_write_zero_page(inode, start_off);
- if (ret < 0) {
- mlog_errno(ret);
+ while (zero_cpos < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
+ &num_clusters, &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
goto out;
}
- start_off += sb->s_blocksize;
+ if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+ zero_clusters = num_clusters;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ break;
+ }
+
+ zero_cpos += num_clusters;
+ }
+ if (!zero_clusters) {
+ *range_end = 0;
+ goto out;
+ }
+
+ while ((zero_cpos + zero_clusters) < last_cpos) {
+ rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
+ &p_cpos, &num_clusters,
+ &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
+ break;
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ needs_cow = 1;
+ zero_clusters += num_clusters;
+ }
+ if ((zero_cpos + zero_clusters) > last_cpos)
+ zero_clusters = last_cpos - zero_cpos;
+
+ if (needs_cow) {
+ rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
+ zero_clusters, UINT_MAX);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+ }
+
+ *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
+ *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
+ zero_cpos + zero_clusters);
+
+out:
+ return rc;
+}
+
+/*
+ * Zero one range returned from ocfs2_zero_extend_get_range(). The caller
+ * has made sure that the entire range needs zeroing.
+ */
+static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
+ u64 range_end, struct buffer_head *di_bh)
+{
+ int rc = 0;
+ u64 next_pos;
+ u64 zero_pos = range_start;
+
+ trace_ocfs2_zero_extend_range(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)range_start,
+ (unsigned long long)range_end);
+ BUG_ON(range_start >= range_end);
+
+ while (zero_pos < range_end) {
+ next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+ if (next_pos > range_end)
+ next_pos = range_end;
+ rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
+ if (rc < 0) {
+ mlog_errno(rc);
+ break;
+ }
+ zero_pos = next_pos;
/*
* Very large extends have the potential to lock up
@@ -823,16 +966,63 @@ static int ocfs2_zero_extend(struct inode *inode,
cond_resched();
}
-out:
+ return rc;
+}
+
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to_size)
+{
+ int ret = 0;
+ u64 zero_start, range_start = 0, range_end = 0;
+ struct super_block *sb = inode->i_sb;
+
+ zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+ trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)zero_start,
+ (unsigned long long)i_size_read(inode));
+ while (zero_start < zero_to_size) {
+ ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
+ zero_to_size,
+ &range_start,
+ &range_end);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ if (!range_end)
+ break;
+ /* Trim the ends */
+ if (range_start < zero_start)
+ range_start = zero_start;
+ if (range_end > zero_to_size)
+ range_end = zero_to_size;
+
+ ret = ocfs2_zero_extend_range(inode, range_start,
+ range_end, di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ zero_start = range_end;
+ }
+
return ret;
}
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to)
{
int ret;
u32 clusters_to_add;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ /*
+ * Only quota files call this without a bh, and they can't be
+ * refcounted.
+ */
+ BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+ BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
+
clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
if (clusters_to_add < oi->ip_clusters)
clusters_to_add = 0;
@@ -853,7 +1043,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
* still need to zero the area between the old i_size and the
* new i_size.
*/
- ret = ocfs2_zero_extend(inode, zero_to);
+ ret = ocfs2_zero_extend(inode, di_bh, zero_to);
if (ret < 0)
mlog_errno(ret);
@@ -875,27 +1065,15 @@ static int ocfs2_extend_file(struct inode *inode,
goto out;
if (i_size_read(inode) == new_i_size)
- goto out;
+ goto out;
BUG_ON(new_i_size < i_size_read(inode));
/*
- * Fall through for converting inline data, even if the fs
- * supports sparse files.
- *
- * The check for inline data here is legal - nobody can add
- * the feature since we have i_mutex. We must check it again
- * after acquiring ip_alloc_sem though, as paths like mmap
- * might have raced us to converting the inode to extents.
- */
- if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- goto out_update_size;
-
- /*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
* i_mutex to block other extend/truncate calls while we're
- * here.
+ * here. We even have to hold it for sparse files because there
+ * might be some tail zeroing.
*/
down_write(&oi->ip_alloc_sem);
@@ -912,14 +1090,16 @@ static int ocfs2_extend_file(struct inode *inode,
ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
if (ret) {
up_write(&oi->ip_alloc_sem);
-
mlog_errno(ret);
goto out;
}
}
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
+ else
+ ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
+ new_i_size);
up_write(&oi->ip_alloc_sem);
@@ -945,43 +1125,33 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
struct ocfs2_super *osb = OCFS2_SB(sb);
struct buffer_head *bh = NULL;
handle_t *handle = NULL;
- int qtype;
- struct dquot *transfer_from[MAXQUOTAS] = { };
struct dquot *transfer_to[MAXQUOTAS] = { };
+ int qtype;
- mlog_entry("(0x%p, '%.*s')\n", dentry,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_setattr(inode, dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ dentry->d_name.len, dentry->d_name.name,
+ attr->ia_valid, attr->ia_mode,
+ from_kuid(&init_user_ns, attr->ia_uid),
+ from_kgid(&init_user_ns, attr->ia_gid));
/* ensuring we don't even attempt to truncate a symlink */
if (S_ISLNK(inode->i_mode))
attr->ia_valid &= ~ATTR_SIZE;
- if (attr->ia_valid & ATTR_MODE)
- mlog(0, "mode change: %d\n", attr->ia_mode);
- if (attr->ia_valid & ATTR_UID)
- mlog(0, "uid change: %d\n", attr->ia_uid);
- if (attr->ia_valid & ATTR_GID)
- mlog(0, "gid change: %d\n", attr->ia_gid);
- if (attr->ia_valid & ATTR_SIZE)
- mlog(0, "size change...\n");
- if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
- mlog(0, "time change...\n");
-
#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
| ATTR_GID | ATTR_UID | ATTR_MODE)
- if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
- mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
+ if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
return 0;
- }
status = inode_change_ok(inode, attr);
if (status)
return status;
+ if (is_quota_modification(inode, attr))
+ dquot_initialize(inode);
size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
if (size_change) {
- dquot_initialize(inode);
-
status = ocfs2_rw_lock(inode, 1);
if (status < 0) {
mlog_errno(status);
@@ -996,12 +1166,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
goto bail_unlock_rw;
}
- if (size_change && attr->ia_size != i_size_read(inode)) {
+ if (size_change) {
status = inode_newsize_ok(inode, attr->ia_size);
if (status)
goto bail_unlock;
- if (i_size_read(inode) > attr->ia_size) {
+ inode_dio_wait(inode);
+
+ if (i_size_read(inode) >= attr->ia_size) {
if (ocfs2_should_order_data(inode)) {
status = ocfs2_begin_ordered_truncate(inode,
attr->ia_size);
@@ -1019,33 +1191,27 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
- (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+ (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
/*
* Gather pointers to quota structures so that allocation /
* freeing of quota structures happens here and not inside
* dquot_transfer() where we have problems with lock ordering
*/
- if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
+ if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
- transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
- USRQUOTA);
- transfer_from[USRQUOTA] = dqget(sb, inode->i_uid,
- USRQUOTA);
- if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
+ transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
+ if (!transfer_to[USRQUOTA]) {
status = -ESRCH;
goto bail_unlock;
}
}
- if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
+ if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
- transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
- GRPQUOTA);
- transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid,
- GRPQUOTA);
- if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
+ transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
+ if (!transfer_to[GRPQUOTA]) {
status = -ESRCH;
goto bail_unlock;
}
@@ -1057,7 +1223,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
mlog_errno(status);
goto bail_unlock;
}
- status = dquot_transfer(inode, attr);
+ status = __dquot_transfer(inode, transfer_to);
if (status < 0)
goto bail_commit;
} else {
@@ -1069,18 +1235,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- /*
- * This will intentionally not wind up calling vmtruncate(),
- * since all the work for a size change has been done above.
- * Otherwise, we could get into problems with truncate as
- * ip_alloc_sem is used there to protect against i_size
- * changes.
- */
- status = inode_setattr(inode, attr);
- if (status < 0) {
- mlog_errno(status);
- goto bail_commit;
- }
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
status = ocfs2_mark_inode_dirty(handle, inode, bh);
if (status < 0)
@@ -1097,18 +1253,15 @@ bail:
brelse(bh);
/* Release quota pointers in case we acquired them */
- for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
+ for (qtype = 0; qtype < MAXQUOTAS; qtype++)
dqput(transfer_to[qtype]);
- dqput(transfer_from[qtype]);
- }
if (!status && attr->ia_valid & ATTR_MODE) {
- status = ocfs2_acl_chmod(inode);
+ status = posix_acl_chmod(inode, inode->i_mode);
if (status < 0)
mlog_errno(status);
}
- mlog_exit(status);
return status;
}
@@ -1121,8 +1274,6 @@ int ocfs2_getattr(struct vfsmount *mnt,
struct ocfs2_super *osb = sb->s_fs_info;
int err;
- mlog_entry_void();
-
err = ocfs2_inode_revalidate(dentry);
if (err) {
if (err != -ENOENT)
@@ -1136,8 +1287,6 @@ int ocfs2_getattr(struct vfsmount *mnt,
stat->blksize = osb->s_clustersize;
bail:
- mlog_exit(err);
-
return err;
}
@@ -1145,7 +1294,8 @@ int ocfs2_permission(struct inode *inode, int mask)
{
int ret;
- mlog_entry_void();
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
ret = ocfs2_inode_lock(inode, NULL, 0);
if (ret) {
@@ -1154,11 +1304,10 @@ int ocfs2_permission(struct inode *inode, int mask)
goto out;
}
- ret = generic_permission(inode, mask, ocfs2_check_acl);
+ ret = generic_permission(inode, mask);
ocfs2_inode_unlock(inode, 0);
out:
- mlog_exit(ret);
return ret;
}
@@ -1170,8 +1319,9 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di;
- mlog_entry("(Inode %llu, mode 0%o)\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
+ trace_ocfs2_write_remove_suid(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ inode->i_mode);
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
@@ -1193,15 +1343,13 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
di = (struct ocfs2_dinode *) bh->b_data;
di->i_mode = cpu_to_le16(inode->i_mode);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, bh);
out_trans:
ocfs2_commit_trans(osb, handle);
out:
- mlog_exit(ret);
return ret;
}
@@ -1380,8 +1528,9 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
* partial clusters here. There's no need to worry about
* physical allocation - the zeroing code knows to skip holes.
*/
- mlog(0, "byte start: %llu, end: %llu\n",
- (unsigned long long)start, (unsigned long long)end);
+ trace_ocfs2_zero_partial_clusters(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)start, (unsigned long long)end);
/*
* If both edges are on a cluster boundary then there's no
@@ -1405,8 +1554,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
if (tmpend > end)
tmpend = end;
- mlog(0, "1st range: start: %llu, tmpend: %llu\n",
- (unsigned long long)start, (unsigned long long)tmpend);
+ trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
+ (unsigned long long)tmpend);
ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
if (ret)
@@ -1420,33 +1569,125 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
*/
start = end & ~(osb->s_clustersize - 1);
- mlog(0, "2nd range: start: %llu, end: %llu\n",
- (unsigned long long)start, (unsigned long long)end);
+ trace_ocfs2_zero_partial_clusters_range2(
+ (unsigned long long)start, (unsigned long long)end);
ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
if (ret)
mlog_errno(ret);
}
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_commit_trans(osb, handle);
out:
return ret;
}
+static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
+{
+ int i;
+ struct ocfs2_extent_rec *rec = NULL;
+
+ for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+
+ rec = &el->l_recs[i];
+
+ if (le32_to_cpu(rec->e_cpos) < pos)
+ break;
+ }
+
+ return i;
+}
+
+/*
+ * Helper to calculate the punching pos and length in one run, we handle the
+ * following three cases in order:
+ *
+ * - remove the entire record
+ * - remove a partial record
+ * - no record needs to be removed (hole-punching completed)
+*/
+static void ocfs2_calc_trunc_pos(struct inode *inode,
+ struct ocfs2_extent_list *el,
+ struct ocfs2_extent_rec *rec,
+ u32 trunc_start, u32 *trunc_cpos,
+ u32 *trunc_len, u32 *trunc_end,
+ u64 *blkno, int *done)
+{
+ int ret = 0;
+ u32 coff, range;
+
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+ if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
+ /*
+ * remove an entire extent record.
+ */
+ *trunc_cpos = le32_to_cpu(rec->e_cpos);
+ /*
+ * Skip holes if any.
+ */
+ if (range < *trunc_end)
+ *trunc_end = range;
+ *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
+ *blkno = le64_to_cpu(rec->e_blkno);
+ *trunc_end = le32_to_cpu(rec->e_cpos);
+ } else if (range > trunc_start) {
+ /*
+ * remove a partial extent record, which means we're
+ * removing the last extent record.
+ */
+ *trunc_cpos = trunc_start;
+ /*
+ * skip hole if any.
+ */
+ if (range < *trunc_end)
+ *trunc_end = range;
+ *trunc_len = *trunc_end - trunc_start;
+ coff = trunc_start - le32_to_cpu(rec->e_cpos);
+ *blkno = le64_to_cpu(rec->e_blkno) +
+ ocfs2_clusters_to_blocks(inode->i_sb, coff);
+ *trunc_end = trunc_start;
+ } else {
+ /*
+ * It may have two following possibilities:
+ *
+ * - last record has been removed
+ * - trunc_start was within a hole
+ *
+ * both two cases mean the completion of hole punching.
+ */
+ ret = 1;
+ }
+
+ *done = ret;
+}
+
static int ocfs2_remove_inode_range(struct inode *inode,
struct buffer_head *di_bh, u64 byte_start,
u64 byte_len)
{
- int ret = 0;
- u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
+ int ret = 0, flags = 0, done = 0, i;
+ u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
+ u32 cluster_in_el;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_cached_dealloc_ctxt dealloc;
struct address_space *mapping = inode->i_mapping;
struct ocfs2_extent_tree et;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_list *el = NULL;
+ struct ocfs2_extent_rec *rec = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
ocfs2_init_dealloc_ctxt(&dealloc);
+ trace_ocfs2_remove_inode_range(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)byte_start,
+ (unsigned long long)byte_len);
+
if (byte_len == 0)
return 0;
@@ -1468,17 +1709,30 @@ static int ocfs2_remove_inode_range(struct inode *inode,
goto out;
}
- trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
- trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
- if (trunc_len >= trunc_start)
- trunc_len -= trunc_start;
- else
- trunc_len = 0;
+ /*
+ * For reflinks, we may need to CoW 2 clusters which might be
+ * partially zero'd later, if hole's start and end offset were
+ * within one cluster(means is not exactly aligned to clustersize).
+ */
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+
+ ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
- mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)byte_start,
- (unsigned long long)byte_len, trunc_start, trunc_len);
+ trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
+ trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
+ cluster_in_el = trunc_end;
ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
if (ret) {
@@ -1486,36 +1740,85 @@ static int ocfs2_remove_inode_range(struct inode *inode,
goto out;
}
- cpos = trunc_start;
- while (trunc_len) {
- ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
- &alloc_size, NULL);
+ path = ocfs2_new_path_from_et(&et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ while (trunc_end > trunc_start) {
+
+ ret = ocfs2_find_path(INODE_CACHE(inode), path,
+ cluster_in_el);
if (ret) {
mlog_errno(ret);
goto out;
}
- if (alloc_size > trunc_len)
- alloc_size = trunc_len;
+ el = path_leaf_el(path);
+
+ i = ocfs2_find_rec(el, trunc_end);
+ /*
+ * Need to go to previous extent block.
+ */
+ if (i < 0) {
+ if (path->p_tree_depth == 0)
+ break;
- /* Only do work for non-holes */
- if (phys_cpos != 0) {
- ret = ocfs2_remove_btree_range(inode, &et, cpos,
- phys_cpos, alloc_size,
- &dealloc);
+ ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+ path,
+ &cluster_in_el);
if (ret) {
mlog_errno(ret);
goto out;
}
+
+ /*
+ * We've reached the leftmost extent block,
+ * it's safe to leave.
+ */
+ if (cluster_in_el == 0)
+ break;
+
+ /*
+ * The 'pos' searched for previous extent block is
+ * always one cluster less than actual trunc_end.
+ */
+ trunc_end = cluster_in_el + 1;
+
+ ocfs2_reinit_path(path, 1);
+
+ continue;
+
+ } else
+ rec = &el->l_recs[i];
+
+ ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
+ &trunc_len, &trunc_end, &blkno, &done);
+ if (done)
+ break;
+
+ flags = rec->e_flags;
+ phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
+
+ ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
+ phys_cpos, trunc_len, flags,
+ &dealloc, refcount_loc);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
}
- cpos += alloc_size;
- trunc_len -= alloc_size;
+ cluster_in_el = trunc_end;
+
+ ocfs2_reinit_path(path, 1);
}
ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
out:
+ ocfs2_free_path(path);
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &dealloc);
@@ -1589,7 +1892,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
}
size = sr->l_start + sr->l_len;
- if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
+ if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
+ cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
if (sr->l_len <= 0) {
ret = -EINVAL;
goto out_inode_unlock;
@@ -1647,6 +1951,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (ret < 0)
mlog_errno(ret);
+ if (file && (file->f_flags & O_SYNC))
+ handle->h_sync = 1;
+
ocfs2_commit_trans(osb, handle);
out_inode_unlock:
@@ -1663,8 +1970,9 @@ out:
int ocfs2_change_file_space(struct file *file, unsigned int cmd,
struct ocfs2_space_resv *sr)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int ret;
if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
!ocfs2_writes_unwritten_extents(osb))
@@ -1679,31 +1987,40 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
- return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+ ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
+ mnt_drop_write_file(file);
+ return ret;
}
-static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
+static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
+ struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_space_resv sr;
int change_size = 1;
+ int cmd = OCFS2_IOC_RESVSP64;
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
if (!ocfs2_writes_unwritten_extents(osb))
return -EOPNOTSUPP;
- if (S_ISDIR(inode->i_mode))
- return -ENODEV;
-
if (mode & FALLOC_FL_KEEP_SIZE)
change_size = 0;
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ cmd = OCFS2_IOC_UNRESVSP64;
+
sr.l_whence = 0;
sr.l_start = (s64)offset;
sr.l_len = (s64)len;
- return __ocfs2_change_file_space(NULL, inode, offset,
- OCFS2_IOC_RESVSP64, &sr, change_size);
+ return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
+ change_size);
}
int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
@@ -1745,7 +2062,18 @@ out:
return ret;
}
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+ int blockmask = inode->i_sb->s_blocksize - 1;
+ loff_t final_size = pos + count;
+
+ if ((pos & blockmask) || (final_size & blockmask))
+ return 1;
+ return 0;
+}
+
static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+ struct file *file,
loff_t pos, size_t count,
int *meta_level)
{
@@ -1771,7 +2099,7 @@ out:
return ret;
}
-static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
+static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t *ppos,
size_t count,
int appending,
@@ -1779,8 +2107,9 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
int *has_refcount)
{
int ret = 0, meta_level = 0;
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
- loff_t saved_pos, end;
+ loff_t saved_pos = 0, end;
/*
* We start with a read level meta lock and only jump to an ex
@@ -1799,7 +2128,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
* remove_suid() calls ->setattr without any hint that
* we may have already done our cluster locking. Since
* ocfs2_setattr() *must* take cluster locks to
- * proceeed, this will lead us to recursively lock the
+ * proceed, this will lead us to recursively lock the
* inode. There's also the dinode i_size state which
* can be lost via setattr during extending writes (we
* set inode->i_size at the end of a write. */
@@ -1819,12 +2148,10 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
/* work on a copy of ppos until we're sure that we won't have
* to recalculate it due to relocking. */
- if (appending) {
+ if (appending)
saved_pos = i_size_read(inode);
- mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
- } else {
+ else
saved_pos = *ppos;
- }
end = saved_pos + count;
@@ -1834,6 +2161,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
meta_level = -1;
ret = ocfs2_prepare_inode_for_refcount(inode,
+ file,
saved_pos,
count,
&meta_level);
@@ -1894,6 +2222,10 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
*ppos = saved_pos;
out_unlock:
+ trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
+ saved_pos, appending, count,
+ direct_io, has_refcount);
+
if (meta_level >= 0)
ocfs2_inode_unlock(inode, meta_level);
@@ -1901,68 +2233,96 @@ out:
return ret;
}
-static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
+static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
int can_do_direct, has_refcount = 0;
ssize_t written = 0;
- size_t ocount; /* original count */
- size_t count; /* after file limit checks */
+ size_t count = iov_iter_count(from);
loff_t old_size, *ppos = &iocb->ki_pos;
u32 old_clusters;
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int full_coherency = !(osb->s_mount_opt &
+ OCFS2_MOUNT_COHERENCY_BUFFERED);
+ int unaligned_dio = 0;
- mlog_entry("(0x%p, %u, '%.*s')\n", file,
- (unsigned int)nr_segs,
- file->f_path.dentry->d_name.len,
- file->f_path.dentry->d_name.name);
+ trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name,
+ (unsigned int)from->nr_segs); /* GRRRRR */
- if (iocb->ki_left == 0)
+ if (iocb->ki_nbytes == 0)
return 0;
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-
appending = file->f_flags & O_APPEND ? 1 : 0;
direct_io = file->f_flags & O_DIRECT ? 1 : 0;
mutex_lock(&inode->i_mutex);
+ ocfs2_iocb_clear_sem_locked(iocb);
+
relock:
- /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
+ /* to match setattr's i_mutex -> rw_lock ordering */
if (direct_io) {
- down_read(&inode->i_alloc_sem);
have_alloc_sem = 1;
+ /* communicate with ocfs2_dio_end_io */
+ ocfs2_iocb_set_sem_locked(iocb);
}
- /* concurrent O_DIRECT writes are allowed */
- rw_level = !direct_io;
+ /*
+ * Concurrent O_DIRECT writes are allowed with
+ * mount_option "coherency=buffered".
+ */
+ rw_level = (!direct_io || full_coherency);
+
ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
mlog_errno(ret);
goto out_sems;
}
+ /*
+ * O_DIRECT writes with "coherency=full" need to take EX cluster
+ * inode_lock to guarantee coherency.
+ */
+ if (direct_io && full_coherency) {
+ /*
+ * We need to take and drop the inode lock to force
+ * other nodes to drop their caches. Buffered I/O
+ * already does this in write_begin().
+ */
+ ret = ocfs2_inode_lock(inode, NULL, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_inode_unlock(inode, 1);
+ }
+
can_do_direct = direct_io;
- ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
- iocb->ki_left, appending,
+ ret = ocfs2_prepare_inode_for_write(file, ppos,
+ iocb->ki_nbytes, appending,
&can_do_direct, &has_refcount);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
+ if (direct_io && !is_sync_kiocb(iocb))
+ unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes,
+ *ppos);
+
/*
* We can't complete the direct I/O as requested, fall back to
* buffered I/O.
*/
if (direct_io && !can_do_direct) {
ocfs2_rw_unlock(inode, rw_level);
- up_read(&inode->i_alloc_sem);
have_alloc_sem = 0;
rw_level = -1;
@@ -1971,6 +2331,16 @@ relock:
goto relock;
}
+ if (unaligned_dio) {
+ /*
+ * Wait on previous unaligned aio to complete before
+ * proceeding.
+ */
+ mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
+ /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
+ ocfs2_iocb_set_unaligned_aio(iocb);
+ }
+
/*
* To later detect whether a journal commit for sync writes is
* necessary, we sample i_size, and cluster count here.
@@ -1981,33 +2351,24 @@ relock:
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb, rw_level);
- if (direct_io) {
- ret = generic_segment_checks(iov, &nr_segs, &ocount,
- VERIFY_READ);
- if (ret)
- goto out_dio;
-
- count = ocount;
- ret = generic_write_checks(file, ppos, &count,
- S_ISBLK(inode->i_mode));
- if (ret)
- goto out_dio;
+ ret = generic_write_checks(file, ppos, &count,
+ S_ISBLK(inode->i_mode));
+ if (ret)
+ goto out_dio;
- written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
- ppos, count, ocount);
+ iov_iter_truncate(from, count);
+ if (direct_io) {
+ written = generic_file_direct_write(iocb, from, *ppos);
if (written < 0) {
- /*
- * direct write may have instantiated a few
- * blocks outside i_size. Trim these off again.
- * Don't need i_size_read because we hold i_mutex.
- */
- if (*ppos + count > inode->i_size)
- vmtruncate(inode, inode->i_size);
ret = written;
goto out_dio;
}
} else {
- written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
+ current->backing_dev_info = file->f_mapping->backing_dev_info;
+ written = generic_perform_write(file, from, *ppos);
+ if (likely(written >= 0))
+ iocb->ki_pos = *ppos + written;
+ current->backing_dev_info = NULL;
}
out_dio:
@@ -2015,30 +2376,29 @@ out_dio:
BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
- ((file->f_flags & O_DIRECT) && has_refcount)) {
- ret = filemap_fdatawrite_range(file->f_mapping, pos,
- pos + count - 1);
+ ((file->f_flags & O_DIRECT) && !direct_io)) {
+ ret = filemap_fdatawrite_range(file->f_mapping, *ppos,
+ *ppos + count - 1);
if (ret < 0)
written = ret;
- if (!ret && (old_size != i_size_read(inode) ||
- old_clusters != OCFS2_I(inode)->ip_clusters ||
- has_refcount)) {
+ if (!ret && ((old_size != i_size_read(inode)) ||
+ (old_clusters != OCFS2_I(inode)->ip_clusters) ||
+ has_refcount)) {
ret = jbd2_journal_force_commit(osb->journal->j_journal);
if (ret < 0)
written = ret;
}
if (!ret)
- ret = filemap_fdatawait_range(file->f_mapping, pos,
- pos + count - 1);
+ ret = filemap_fdatawait_range(file->f_mapping, *ppos,
+ *ppos + count - 1);
}
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
* function pointer which is called when o_direct io completes so that
- * it can unlock our rw lock. (it's the clustered equivalent of
- * i_alloc_sem; protects truncate from racing with pending ios).
+ * it can unlock our rw lock.
* Unfortunately there are error cases which call end_io and others
* that don't. so we don't have to unlock the rw_lock if either an
* async dio is going to do it in the future or an end_io after an
@@ -2047,6 +2407,12 @@ out_dio:
if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
have_alloc_sem = 0;
+ unaligned_dio = 0;
+ }
+
+ if (unaligned_dio) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
+ mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
out:
@@ -2055,96 +2421,12 @@ out:
out_sems:
if (have_alloc_sem)
- up_read(&inode->i_alloc_sem);
+ ocfs2_iocb_clear_sem_locked(iocb);
mutex_unlock(&inode->i_mutex);
if (written)
ret = written;
- mlog_exit(ret);
- return ret;
-}
-
-static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
- struct file *out,
- struct splice_desc *sd)
-{
- int ret;
-
- ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
- sd->total_len, 0, NULL, NULL);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- return splice_from_pipe_feed(pipe, sd, pipe_to_file);
-}
-
-static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
- struct file *out,
- loff_t *ppos,
- size_t len,
- unsigned int flags)
-{
- int ret;
- struct address_space *mapping = out->f_mapping;
- struct inode *inode = mapping->host;
- struct splice_desc sd = {
- .total_len = len,
- .flags = flags,
- .pos = *ppos,
- .u.file = out,
- };
-
- mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
- (unsigned int)len,
- out->f_path.dentry->d_name.len,
- out->f_path.dentry->d_name.name);
-
- if (pipe->inode)
- mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
-
- splice_from_pipe_begin(&sd);
- do {
- ret = splice_from_pipe_next(pipe, &sd);
- if (ret <= 0)
- break;
-
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- ret = ocfs2_rw_lock(inode, 1);
- if (ret < 0)
- mlog_errno(ret);
- else {
- ret = ocfs2_splice_to_file(pipe, out, &sd);
- ocfs2_rw_unlock(inode, 1);
- }
- mutex_unlock(&inode->i_mutex);
- } while (ret > 0);
- splice_from_pipe_end(pipe, &sd);
-
- if (pipe->inode)
- mutex_unlock(&pipe->inode->i_mutex);
-
- if (sd.num_spliced)
- ret = sd.num_spliced;
-
- if (ret > 0) {
- unsigned long nr_pages;
- int err;
-
- nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
- err = generic_write_sync(out, *ppos, ret);
- if (err)
- ret = err;
- else
- *ppos += ret;
-
- balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
- }
-
- mlog_exit(ret);
return ret;
}
@@ -2155,17 +2437,17 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
unsigned int flags)
{
int ret = 0, lock_level = 0;
- struct inode *inode = in->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(in);
- mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
- (unsigned int)len,
- in->f_path.dentry->d_name.len,
- in->f_path.dentry->d_name.name);
+ trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ in->f_path.dentry->d_name.len,
+ in->f_path.dentry->d_name.name, len);
/*
- * See the comment in ocfs2_file_aio_read()
+ * See the comment in ocfs2_file_read_iter()
*/
- ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
+ ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto bail;
@@ -2175,23 +2457,22 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
ret = generic_file_splice_read(in, ppos, pipe, len, flags);
bail:
- mlog_exit(ret);
return ret;
}
-static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
+static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
+ struct iov_iter *to)
{
int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
struct file *filp = iocb->ki_filp;
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
+
+ trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ filp->f_path.dentry->d_name.len,
+ filp->f_path.dentry->d_name.name,
+ to->nr_segs); /* GRRRRR */
- mlog_entry("(0x%p, %u, '%.*s')\n", filp,
- (unsigned int)nr_segs,
- filp->f_path.dentry->d_name.len,
- filp->f_path.dentry->d_name.name);
if (!inode) {
ret = -EINVAL;
@@ -2199,13 +2480,15 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
goto bail;
}
+ ocfs2_iocb_clear_sem_locked(iocb);
+
/*
* buffered reads protect themselves in ->readpage(). O_DIRECT reads
* need locks to protect pending reads from racing with truncate.
*/
if (filp->f_flags & O_DIRECT) {
- down_read(&inode->i_alloc_sem);
have_alloc_sem = 1;
+ ocfs2_iocb_set_sem_locked(iocb);
ret = ocfs2_rw_lock(inode, 0);
if (ret < 0) {
@@ -2226,21 +2509,20 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
* like i_size. This allows the checks down below
* generic_file_aio_read() a chance of actually working.
*/
- ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+ ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto bail;
}
ocfs2_inode_unlock(inode, lock_level);
- ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
- if (ret == -EINVAL)
- mlog(0, "generic_file_aio_read returned -EINVAL\n");
+ ret = generic_file_read_iter(iocb, to);
+ trace_generic_file_aio_read_ret(ret);
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
- /* see ocfs2_file_aio_write */
+ /* see ocfs2_file_write_iter */
if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
rw_level = -1;
have_alloc_sem = 0;
@@ -2248,14 +2530,64 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
bail:
if (have_alloc_sem)
- up_read(&inode->i_alloc_sem);
+ ocfs2_iocb_clear_sem_locked(iocb);
+
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
- mlog_exit(ret);
return ret;
}
+/* Refer generic_file_llseek_unlocked() */
+static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ switch (whence) {
+ case SEEK_SET:
+ break;
+ case SEEK_END:
+ /* SEEK_END requires the OCFS2 inode lock for the file
+ * because it references the file's size.
+ */
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ offset += i_size_read(inode);
+ ocfs2_inode_unlock(inode, 0);
+ break;
+ case SEEK_CUR:
+ if (offset == 0) {
+ offset = file->f_pos;
+ goto out;
+ }
+ offset += file->f_pos;
+ break;
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
+ if (ret)
+ goto out;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+
+out:
+ mutex_unlock(&inode->i_mutex);
+ if (ret)
+ return ret;
+ return offset;
+}
+
const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
@@ -2264,14 +2596,17 @@ const struct inode_operations ocfs2_file_iops = {
.getxattr = generic_getxattr,
.listxattr = ocfs2_listxattr,
.removexattr = generic_removexattr,
- .fallocate = ocfs2_fallocate,
.fiemap = ocfs2_fiemap,
+ .get_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
};
const struct inode_operations ocfs2_special_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
+ .get_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
};
/*
@@ -2279,15 +2614,15 @@ const struct inode_operations ocfs2_special_file_iops = {
* ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
*/
const struct file_operations ocfs2_fops = {
- .llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
+ .llseek = ocfs2_file_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
.mmap = ocfs2_mmap,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
- .aio_read = ocfs2_file_aio_read,
- .aio_write = ocfs2_file_aio_write,
+ .read_iter = ocfs2_file_read_iter,
+ .write_iter = ocfs2_file_write_iter,
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
@@ -2295,13 +2630,14 @@ const struct file_operations ocfs2_fops = {
.lock = ocfs2_lock,
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
- .splice_write = ocfs2_file_splice_write,
+ .splice_write = iter_file_splice_write,
+ .fallocate = ocfs2_fallocate,
};
const struct file_operations ocfs2_dops = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = ocfs2_readdir,
+ .iterate = ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
@@ -2326,28 +2662,29 @@ const struct file_operations ocfs2_dops = {
* the cluster.
*/
const struct file_operations ocfs2_fops_no_plocks = {
- .llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
+ .llseek = ocfs2_file_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
.mmap = ocfs2_mmap,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
- .aio_read = ocfs2_file_aio_read,
- .aio_write = ocfs2_file_aio_write,
+ .read_iter = ocfs2_file_read_iter,
+ .write_iter = ocfs2_file_write_iter,
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
- .splice_write = ocfs2_file_splice_write,
+ .splice_write = iter_file_splice_write,
+ .fallocate = ocfs2_fallocate,
};
const struct file_operations ocfs2_dops_no_plocks = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
- .readdir = ocfs2_readdir,
+ .iterate = ocfs2_readdir,
.fsync = ocfs2_sync_file,
.release = ocfs2_dir_release,
.open = ocfs2_dir_open,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f7c70..97bf761c9e7 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
int ocfs2_simple_size_update(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size);
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
- u64 zero_to);
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+ u64 new_i_size, u64 zero_to);
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+ loff_t zero_to);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db86..d8208b20dc5 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,10 +26,8 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -38,6 +36,7 @@
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -67,7 +66,7 @@ void ocfs2_do_node_down(int node_num, void *data)
BUG_ON(osb->node_num == node_num);
- mlog(0, "ocfs2: node down event for %d\n", node_num);
+ trace_ocfs2_do_node_down(node_num);
if (!osb->cconn) {
/*
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 278a223aae1..437de7f768c 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,14 +25,12 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <asm/byteorder.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -54,6 +52,7 @@
#include "uptodate.h"
#include "xattr.h"
#include "refcounttree.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -131,8 +130,10 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
+ journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
- mlog_entry("(blkno = %llu)\n", (unsigned long long)blkno);
+ trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
+ sysfile_type);
/* Ok. By now we've either got the offsets passed to us by the
* caller, or we just pulled them off the bh. Lets do some
@@ -153,27 +154,52 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
/* inode was *not* in the inode cache. 2.6.x requires
* us to do our own read_inode call and unlock it
* afterwards. */
- if (inode && inode->i_state & I_NEW) {
- mlog(0, "Inode was not in inode cache, reading it.\n");
- ocfs2_read_locked_inode(inode, &args);
- unlock_new_inode(inode);
- }
if (inode == NULL) {
inode = ERR_PTR(-ENOMEM);
mlog_errno(PTR_ERR(inode));
goto bail;
}
+ trace_ocfs2_iget5_locked(inode->i_state);
+ if (inode->i_state & I_NEW) {
+ ocfs2_read_locked_inode(inode, &args);
+ unlock_new_inode(inode);
+ }
if (is_bad_inode(inode)) {
iput(inode);
inode = ERR_PTR(-ESTALE);
goto bail;
}
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ transaction_t *transaction;
+ tid_t tid;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ read_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ read_unlock(&journal->j_state_lock);
+ oi->i_sync_tid = tid;
+ oi->i_datasync_tid = tid;
+ }
+
bail:
if (!IS_ERR(inode)) {
- mlog(0, "returning inode with number %llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- mlog_exit_ptr(inode);
+ trace_ocfs2_iget_end(inode,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
}
return inode;
@@ -193,18 +219,17 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
int ret = 0;
- mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
-
args = opaque;
mlog_bug_on_msg(!inode, "No inode in find actor!\n");
+ trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno);
+
if (oi->ip_blkno != args->fi_blkno)
goto bail;
ret = 1;
bail:
- mlog_exit(ret);
return ret;
}
@@ -219,8 +244,6 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
ocfs2_file_ip_alloc_sem_key;
- mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
-
inode->i_ino = args->fi_ino;
OCFS2_I(inode)->ip_blkno = args->fi_blkno;
if (args->fi_sysfile_type != 0)
@@ -236,7 +259,6 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
&ocfs2_file_ip_alloc_sem_key);
- mlog_exit(0);
return 0;
}
@@ -247,9 +269,6 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
struct ocfs2_super *osb;
int use_plocks = 1;
- mlog_entry("(0x%p, size:%llu)\n", inode,
- (unsigned long long)le64_to_cpu(fe->i_size));
-
sb = inode->i_sb;
osb = OCFS2_SB(sb);
@@ -277,15 +296,17 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
inode->i_generation = le32_to_cpu(fe->i_generation);
inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
inode->i_mode = le16_to_cpu(fe->i_mode);
- inode->i_uid = le32_to_cpu(fe->i_uid);
- inode->i_gid = le32_to_cpu(fe->i_gid);
+ i_uid_write(inode, le32_to_cpu(fe->i_uid));
+ i_gid_write(inode, le32_to_cpu(fe->i_gid));
/* Fast symlinks will have i_size but no allocated clusters. */
- if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
+ if (S_ISLNK(inode->i_mode) && !fe->i_clusters) {
inode->i_blocks = 0;
- else
+ inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
+ } else {
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_mapping->a_ops = &ocfs2_aops;
+ inode->i_mapping->a_ops = &ocfs2_aops;
+ }
inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
@@ -299,22 +320,22 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)le64_to_cpu(fe->i_blkno));
- inode->i_nlink = ocfs2_read_links_count(fe);
+ set_nlink(inode, ocfs2_read_links_count(fe));
+ trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno,
+ le32_to_cpu(fe->i_flags));
if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
inode->i_flags |= S_NOQUOTA;
}
-
+
if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
- mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
} else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
inode->i_flags |= S_NOQUOTA;
} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
- mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
/* we can't actually hit this as read_inode can't
* handle superblocks today ;-) */
BUG();
@@ -336,12 +357,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
else
inode->i_fop = &ocfs2_dops_no_plocks;
i_size_write(inode, le64_to_cpu(fe->i_size));
+ OCFS2_I(inode)->ip_dir_lock_gen = 1;
break;
case S_IFLNK:
- if (ocfs2_inode_is_fast_symlink(inode))
- inode->i_op = &ocfs2_fast_symlink_inode_operations;
- else
- inode->i_op = &ocfs2_symlink_inode_operations;
+ inode->i_op = &ocfs2_symlink_inode_operations;
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
default:
@@ -377,7 +396,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
OCFS2_I(inode)->ip_last_used_slot = 0;
OCFS2_I(inode)->ip_last_used_group = 0;
- mlog_exit_void();
+
+ if (S_ISDIR(inode->i_mode))
+ ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
+ OCFS2_RESV_FLAG_DIR);
}
static int ocfs2_read_locked_inode(struct inode *inode,
@@ -390,22 +412,10 @@ static int ocfs2_read_locked_inode(struct inode *inode,
int status, can_lock;
u32 generation = 0;
- mlog_entry("(0x%p, 0x%p)\n", inode, args);
-
status = -EINVAL;
- if (inode == NULL || inode->i_sb == NULL) {
- mlog(ML_ERROR, "bad inode\n");
- return status;
- }
sb = inode->i_sb;
osb = OCFS2_SB(sb);
- if (!args) {
- mlog(ML_ERROR, "bad inode args\n");
- make_bad_inode(inode);
- return status;
- }
-
/*
* To improve performance of cold-cache inode stats, we take
* the cluster lock here if possible.
@@ -430,7 +440,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
* #1 and #2 can be simply solved by never taking the lock
* here for system files (which are the only type we read
* during mount). It's a heavier approach, but our main
- * concern is user-accesible files anyway.
+ * concern is user-accessible files anyway.
*
* #3 works itself out because we'll eventually take the
* cluster lock before trusting anything anyway.
@@ -439,6 +449,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
&& !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
&& !ocfs2_mount_local(osb);
+ trace_ocfs2_read_locked_inode(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock);
+
/*
* To maintain backwards compatibility with older versions of
* ocfs2-tools, we still store the generation value for system
@@ -485,7 +498,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
OCFS2_BH_IGNORE_CACHE);
} else {
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
- if (!status)
+ /*
+ * If buffer is in jbd, then its checksum may not have been
+ * computed as yet.
+ */
+ if (!status && !buffer_jbd(bh))
status = ocfs2_validate_inode_block(osb->sb, bh);
}
if (status < 0) {
@@ -526,7 +543,6 @@ bail:
if (args && bh)
brelse(bh);
- mlog_exit(status);
return status;
}
@@ -540,12 +556,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct buffer_head *fe_bh)
{
int status = 0;
- struct ocfs2_truncate_context *tc = NULL;
struct ocfs2_dinode *fe;
handle_t *handle = NULL;
- mlog_entry_void();
-
fe = (struct ocfs2_dinode *) fe_bh->b_data;
/*
@@ -559,6 +572,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
+ handle = NULL;
mlog_errno(status);
goto out;
}
@@ -582,13 +596,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
ocfs2_commit_trans(osb, handle);
handle = NULL;
- status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
- if (status < 0) {
- mlog_errno(status);
- goto out;
- }
-
- status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+ status = ocfs2_commit_truncate(osb, inode, fe_bh);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -598,7 +606,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
out:
if (handle)
ocfs2_commit_trans(osb, handle);
- mlog_exit(status);
return status;
}
@@ -640,11 +647,13 @@ static int ocfs2_remove_inode(struct inode *inode,
goto bail_unlock;
}
- status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
- orphan_dir_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail_commit;
+ if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+ status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+ orphan_dir_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_commit;
+ }
}
/* set the inodes dtime */
@@ -657,12 +666,7 @@ static int ocfs2_remove_inode(struct inode *inode,
di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
-
- status = ocfs2_journal_dirty(handle, di_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail_commit;
- }
+ ocfs2_journal_dirty(handle, di_bh);
ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
dquot_free_inode(inode);
@@ -697,8 +701,6 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
spin_lock(&osb->osb_lock);
if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) {
- mlog(0, "Recovery is happening on orphan dir %d, will skip "
- "this inode\n", slot);
ret = -EDEADLK;
goto out;
}
@@ -707,6 +709,7 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
osb->osb_orphan_wipes[slot]++;
out:
spin_unlock(&osb->osb_lock);
+ trace_ocfs2_check_orphan_recovery_state(slot, ret);
return ret;
}
@@ -723,38 +726,39 @@ static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
static int ocfs2_wipe_inode(struct inode *inode,
struct buffer_head *di_bh)
{
- int status, orphaned_slot;
+ int status, orphaned_slot = -1;
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_dinode *di;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
- di = (struct ocfs2_dinode *) di_bh->b_data;
- orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
+ if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+ orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
- status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
- if (status)
- return status;
+ status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
+ if (status)
+ return status;
- orphan_dir_inode = ocfs2_get_system_file_inode(osb,
- ORPHAN_DIR_SYSTEM_INODE,
- orphaned_slot);
- if (!orphan_dir_inode) {
- status = -EEXIST;
- mlog_errno(status);
- goto bail;
- }
+ orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+ ORPHAN_DIR_SYSTEM_INODE,
+ orphaned_slot);
+ if (!orphan_dir_inode) {
+ status = -EEXIST;
+ mlog_errno(status);
+ goto bail;
+ }
- /* Lock the orphan dir. The lock will be held for the entire
- * delete_inode operation. We do this now to avoid races with
- * recovery completion on other nodes. */
- mutex_lock(&orphan_dir_inode->i_mutex);
- status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
- if (status < 0) {
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ /* Lock the orphan dir. The lock will be held for the entire
+ * delete_inode operation. We do this now to avoid races with
+ * recovery completion on other nodes. */
+ mutex_lock(&orphan_dir_inode->i_mutex);
+ status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ if (status < 0) {
+ mutex_unlock(&orphan_dir_inode->i_mutex);
- mlog_errno(status);
- goto bail;
+ mlog_errno(status);
+ goto bail;
+ }
}
/* we do this while holding the orphan dir lock because we
@@ -795,6 +799,9 @@ static int ocfs2_wipe_inode(struct inode *inode,
mlog_errno(status);
bail_unlock_dir:
+ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)
+ return status;
+
ocfs2_inode_unlock(orphan_dir_inode, 1);
mutex_unlock(&orphan_dir_inode->i_mutex);
brelse(orphan_dir_bh);
@@ -813,6 +820,10 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task,
+ (unsigned long long)oi->ip_blkno,
+ oi->ip_flags);
+
/* We shouldn't be getting here for the root directory
* inode.. */
if (inode == osb->root_inode) {
@@ -820,16 +831,15 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail;
}
- /* If we're coming from downconvert_thread we can't go into our own
- * voting [hello, deadlock city!], so unforuntately we just
- * have to skip deleting this guy. That's OK though because
- * the node who's doing the actual deleting should handle it
- * anyway. */
- if (current == osb->dc_task) {
- mlog(0, "Skipping delete of %lu because we're currently "
- "in downconvert\n", inode->i_ino);
+ /*
+ * If we're coming from downconvert_thread we can't go into our own
+ * voting [hello, deadlock city!] so we cannot delete the inode. But
+ * since we dropped last inode ref when downconverting dentry lock,
+ * we cannot have the file open and thus the node doing unlink will
+ * take care of deleting the inode.
+ */
+ if (current == osb->dc_task)
goto bail;
- }
spin_lock(&oi->ip_lock);
/* OCFS2 *never* deletes system files. This should technically
@@ -841,15 +851,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail_unlock;
}
- /* If we have allowd wipe of this inode for another node, it
- * will be marked here so we can safely skip it. Recovery will
- * cleanup any inodes we might inadvertantly skip here. */
- if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
- mlog(0, "Skipping delete of %lu because another node "
- "has done this for us.\n", inode->i_ino);
- goto bail_unlock;
- }
-
ret = 1;
bail_unlock:
spin_unlock(&oi->ip_lock);
@@ -865,32 +866,45 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
struct buffer_head *di_bh,
int *wipe)
{
- int status = 0;
+ int status = 0, reason = 0;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_dinode *di;
*wipe = 0;
+ trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno,
+ inode->i_nlink);
+
/* While we were waiting for the cluster lock in
* ocfs2_delete_inode, another node might have asked to delete
* the inode. Recheck our flags to catch this. */
if (!ocfs2_inode_is_valid_to_delete(inode)) {
- mlog(0, "Skipping delete of %llu because flags changed\n",
- (unsigned long long)oi->ip_blkno);
+ reason = 1;
goto bail;
}
/* Now that we have an up to date inode, we can double check
* the link count. */
- if (inode->i_nlink) {
- mlog(0, "Skipping delete of %llu because nlink = %u\n",
- (unsigned long long)oi->ip_blkno, inode->i_nlink);
+ if (inode->i_nlink)
goto bail;
- }
/* Do some basic inode verification... */
di = (struct ocfs2_dinode *) di_bh->b_data;
- if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+ if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) &&
+ !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+ /*
+ * Inodes in the orphan dir must have ORPHANED_FL. The only
+ * inodes that come back out of the orphan dir are reflink
+ * targets. A reflink target may be moved out of the orphan
+ * dir between the time we scan the directory and the time we
+ * process it. This would lead to HAS_REFCOUNT_FL being set but
+ * ORPHANED_FL not.
+ */
+ if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
+ reason = 2;
+ goto bail;
+ }
+
/* for lack of a better error? */
status = -EEXIST;
mlog(ML_ERROR,
@@ -915,7 +929,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
* the inode open lock in ocfs2_read_locked_inode(). When we
* get to ->delete_inode(), each node tries to convert it's
* lock to an exclusive. Trylocks are serialized by the inode
- * meta data lock. If the upconvert suceeds, we know the inode
+ * meta data lock. If the upconvert succeeds, we know the inode
* is no longer live and can be deleted.
*
* Though we call this with the meta data lock held, the
@@ -924,8 +938,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
status = ocfs2_try_open_lock(inode, 1);
if (status == -EAGAIN) {
status = 0;
- mlog(0, "Skipping delete of %llu because it is in use on "
- "other nodes\n", (unsigned long long)oi->ip_blkno);
+ reason = 3;
goto bail;
}
if (status < 0) {
@@ -934,11 +947,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
}
*wipe = 1;
- mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
- (unsigned long long)oi->ip_blkno,
- le16_to_cpu(di->i_orphaned_slot));
+ trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot));
bail:
+ trace_ocfs2_query_inode_wipe_end(status, reason);
return status;
}
@@ -948,30 +960,28 @@ bail:
static void ocfs2_cleanup_delete_inode(struct inode *inode,
int sync_data)
{
- mlog(0, "Cleanup inode %llu, sync = %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
+ trace_ocfs2_cleanup_delete_inode(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
if (sync_data)
- write_inode_now(inode, 1);
- truncate_inode_pages(&inode->i_data, 0);
+ filemap_write_and_wait(inode->i_mapping);
+ truncate_inode_pages_final(&inode->i_data);
}
-void ocfs2_delete_inode(struct inode *inode)
+static void ocfs2_delete_inode(struct inode *inode)
{
int wipe, status;
- sigset_t blocked, oldset;
+ sigset_t oldset;
struct buffer_head *di_bh = NULL;
- mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+ trace_ocfs2_delete_inode(inode->i_ino,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ is_bad_inode(inode));
/* When we fail in read_inode() we mark inode as bad. The second test
* catches the case when inode allocation fails before allocating
* a block for inode. */
- if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
- mlog(0, "Skipping delete of bad inode\n");
+ if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
goto bail;
- }
-
- dquot_initialize(inode);
if (!ocfs2_inode_is_valid_to_delete(inode)) {
/* It's probably not necessary to truncate_inode_pages
@@ -981,17 +991,13 @@ void ocfs2_delete_inode(struct inode *inode)
goto bail;
}
+ dquot_initialize(inode);
+
/* We want to block signals in delete_inode as the lock and
* messaging paths may return us -ERESTARTSYS. Which would
* cause us to exit early, resulting in inodes being orphaned
* forever. */
- sigfillset(&blocked);
- status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
- if (status < 0) {
- mlog_errno(status);
- ocfs2_cleanup_delete_inode(inode, 1);
- goto bail;
- }
+ ocfs2_block_signals(&oldset);
/*
* Synchronize us against ocfs2_get_dentry. We take this in
@@ -1065,26 +1071,20 @@ bail_unlock_nfs_sync:
ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
bail_unblock:
- status = sigprocmask(SIG_SETMASK, &oldset, NULL);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_unblock_signals(&oldset);
bail:
- clear_inode(inode);
- mlog_exit_void();
+ return;
}
-void ocfs2_clear_inode(struct inode *inode)
+static void ocfs2_clear_inode(struct inode *inode)
{
int status;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry_void();
-
- if (!inode)
- goto bail;
-
- mlog(0, "Clearing inode: %llu, nlink = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink);
+ clear_inode(inode);
+ trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
+ inode->i_nlink);
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino);
@@ -1097,9 +1097,13 @@ void ocfs2_clear_inode(struct inode *inode)
/* Do these before all the other work so that we don't bounce
* the downconvert thread while waiting to destroy the locks. */
- ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
- ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
- ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
+
+ ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
+ &oi->ip_la_data_resv);
+ ocfs2_resv_init_once(&oi->ip_la_data_resv);
/* We very well may get a clear_inode before all an inodes
* metadata has hit disk. Of course, we can't drop any cluster
@@ -1169,29 +1173,36 @@ void ocfs2_clear_inode(struct inode *inode)
*/
jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
&oi->ip_jinode);
+}
-bail:
- mlog_exit_void();
+void ocfs2_evict_inode(struct inode *inode)
+{
+ if (!inode->i_nlink ||
+ (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
+ ocfs2_delete_inode(inode);
+ } else {
+ truncate_inode_pages_final(&inode->i_data);
+ }
+ ocfs2_clear_inode(inode);
}
/* Called under inode_lock, with no more references on the
* struct inode, so it's safe here to check the flags field
* and to manipulate i_nlink without any other locks. */
-void ocfs2_drop_inode(struct inode *inode)
+int ocfs2_drop_inode(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ int res;
- mlog_entry_void();
-
- mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
- (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
+ trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
+ inode->i_nlink, oi->ip_flags);
if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
- generic_delete_inode(inode);
+ res = 1;
else
- generic_drop_inode(inode);
+ res = generic_drop_inode(inode);
- mlog_exit_void();
+ return res;
}
/*
@@ -1202,11 +1213,11 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int status = 0;
- mlog_entry("(inode = 0x%p, ino = %llu)\n", inode,
- inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL);
+ trace_ocfs2_inode_revalidate(inode,
+ inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL,
+ inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0);
if (!inode) {
- mlog(0, "eep, no inode!\n");
status = -ENOENT;
goto bail;
}
@@ -1214,7 +1225,6 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
spin_lock(&OCFS2_I(inode)->ip_lock);
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
spin_unlock(&OCFS2_I(inode)->ip_lock);
- mlog(0, "inode deleted!\n");
status = -ENOENT;
goto bail;
}
@@ -1230,8 +1240,6 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
}
ocfs2_inode_unlock(inode, 0);
bail:
- mlog_exit(status);
-
return status;
}
@@ -1247,8 +1255,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
int status;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
- mlog_entry("(inode %llu)\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno);
status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1266,8 +1273,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
fe->i_size = cpu_to_le64(i_size_read(inode));
ocfs2_set_links_count(fe, inode->i_nlink);
- fe->i_uid = cpu_to_le32(inode->i_uid);
- fe->i_gid = cpu_to_le32(inode->i_gid);
+ fe->i_uid = cpu_to_le32(i_uid_read(inode));
+ fe->i_gid = cpu_to_le32(i_gid_read(inode));
fe->i_mode = cpu_to_le16(inode->i_mode);
fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
@@ -1276,14 +1283,9 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0)
- mlog_errno(status);
-
- status = 0;
+ ocfs2_journal_dirty(handle, bh);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
leave:
-
- mlog_exit(status);
return status;
}
@@ -1302,9 +1304,9 @@ void ocfs2_refresh_inode(struct inode *inode,
OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
ocfs2_set_inode_flags(inode);
i_size_write(inode, le64_to_cpu(fe->i_size));
- inode->i_nlink = ocfs2_read_links_count(fe);
- inode->i_uid = le32_to_cpu(fe->i_uid);
- inode->i_gid = le32_to_cpu(fe->i_gid);
+ set_nlink(inode, ocfs2_read_links_count(fe));
+ i_uid_write(inode, le32_to_cpu(fe->i_uid));
+ i_gid_write(inode, le32_to_cpu(fe->i_gid));
inode->i_mode = le16_to_cpu(fe->i_mode);
if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
inode->i_blocks = 0;
@@ -1326,8 +1328,7 @@ int ocfs2_validate_inode_block(struct super_block *sb,
int rc;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
- mlog(0, "Validating dinode %llu\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ba4fe07b293..a6c991c0fc9 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,33 +43,43 @@ struct ocfs2_inode_info
/* protects extended attribute changes on this inode */
struct rw_semaphore ip_xattr_sem;
+ /* Number of outstanding AIO's which are not page aligned */
+ struct mutex ip_unaligned_aio;
+
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
- u32 ip_clusters;
struct list_head ip_io_markers;
+ u32 ip_clusters;
+ u16 ip_dyn_features;
struct mutex ip_io_mutex;
-
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
- u16 ip_dyn_features;
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
- u32 ip_dir_start_lookup;
-
struct ocfs2_caching_info ip_metadata_cache;
-
struct ocfs2_extent_map ip_extent_map;
-
struct inode vfs_inode;
struct jbd2_inode ip_jinode;
+ u32 ip_dir_start_lookup;
+
/* Only valid if the inode is the dir. */
u32 ip_last_used_slot;
u64 ip_last_used_group;
+ u32 ip_dir_lock_gen;
+
+ struct ocfs2_alloc_reservation ip_la_data_resv;
+
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ tid_t i_sync_tid;
+ tid_t i_datasync_tid;
};
/*
@@ -81,8 +91,6 @@ struct ocfs2_inode_info
#define OCFS2_INODE_BITMAP 0x00000004
/* This inode has been wiped from disk */
#define OCFS2_INODE_DELETED 0x00000008
-/* Another node is deleting, so our delete is a nop */
-#define OCFS2_INODE_SKIP_DELETE 0x00000010
/* Has the inode been orphaned on another node?
*
* This hints to ocfs2_drop_inode that it should clear i_nlink before
@@ -97,9 +105,11 @@ struct ocfs2_inode_info
* rely on ocfs2_delete_inode to sort things out under the proper
* cluster locks.
*/
-#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
+#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010
/* Does someone have the file open O_DIRECT */
-#define OCFS2_INODE_OPEN_DIRECT 0x00000040
+#define OCFS2_INODE_OPEN_DIRECT 0x00000020
+/* Tell the inode wipe code it's not in orphan dir */
+#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
{
@@ -119,9 +129,8 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
return &OCFS2_I(inode)->ip_metadata_cache;
}
-void ocfs2_clear_inode(struct inode *inode);
-void ocfs2_delete_inode(struct inode *inode);
-void ocfs2_drop_inode(struct inode *inode);
+void ocfs2_evict_inode(struct inode *inode);
+int ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x1
@@ -143,8 +152,6 @@ void ocfs2_refresh_inode(struct inode *inode,
int ocfs2_mark_inode_dirty(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
-int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
-int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
struct buffer_head *ocfs2_bread(struct inode *inode,
int block, int *err, int reada);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7d9d9c132ce..6f66b3751ac 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,9 +7,9 @@
#include <linux/fs.h>
#include <linux/mount.h>
+#include <linux/blkdev.h>
#include <linux/compat.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -23,8 +23,43 @@
#include "ioctl.h"
#include "resize.h"
#include "refcounttree.h"
+#include "sysfile.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "move_extents.h"
-#include <linux/ext2_fs.h>
+#define o2info_from_user(a, b) \
+ copy_from_user(&(a), (b), sizeof(a))
+#define o2info_to_user(a, b) \
+ copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
+
+/*
+ * This call is void because we are already reporting an error that may
+ * be -EFAULT. The error will be returned from the ioctl(2) call. It's
+ * just a best-effort to tell userspace that this request caused the error.
+ */
+static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
+ struct ocfs2_info_request __user *req)
+{
+ kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
+ (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
+}
+
+static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
+{
+ req->ir_flags |= OCFS2_INFO_FL_FILLED;
+}
+
+static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
+{
+ req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
+}
+
+static inline int o2info_coherent(struct ocfs2_info_request *req)
+{
+ return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
+}
static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
{
@@ -39,7 +74,6 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
*flags = OCFS2_I(inode)->ip_attr;
ocfs2_inode_unlock(inode, 0);
- mlog_exit(status);
return status;
}
@@ -62,19 +96,12 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
}
status = -EACCES;
- if (!is_owner_or_cap(inode))
+ if (!inode_owner_or_capable(inode))
goto bail_unlock;
if (!S_ISDIR(inode->i_mode))
flags &= ~OCFS2_DIRSYNC_FL;
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- mlog_errno(status);
- goto bail_unlock;
- }
-
oldflags = ocfs2_inode->ip_attr;
flags = flags & mask;
flags |= oldflags & ~mask;
@@ -90,6 +117,13 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
goto bail_unlock;
}
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto bail_unlock;
+ }
+
ocfs2_inode->ip_attr = flags;
ocfs2_set_inode_flags(inode);
@@ -98,6 +132,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
mlog_errno(status);
ocfs2_commit_trans(osb, handle);
+
bail_unlock:
ocfs2_inode_unlock(inode, 1);
bail:
@@ -105,21 +140,764 @@ bail:
brelse(bh);
- mlog_exit(status);
+ return status;
+}
+
+static int ocfs2_info_handle_blocksize(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_blocksize oib;
+
+ if (o2info_from_user(oib, req))
+ goto bail;
+
+ oib.ib_blocksize = inode->i_sb->s_blocksize;
+
+ o2info_set_request_filled(&oib.ib_req);
+
+ if (o2info_to_user(oib, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oib.ib_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_handle_clustersize(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_clustersize oic;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oic, req))
+ goto bail;
+
+ oic.ic_clustersize = osb->s_clustersize;
+
+ o2info_set_request_filled(&oic.ic_req);
+
+ if (o2info_to_user(oic, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oic.ic_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_handle_maxslots(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_maxslots oim;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oim, req))
+ goto bail;
+
+ oim.im_max_slots = osb->max_slots;
+
+ o2info_set_request_filled(&oim.im_req);
+
+ if (o2info_to_user(oim, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oim.im_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_handle_label(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_label oil;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oil, req))
+ goto bail;
+
+ memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
+
+ o2info_set_request_filled(&oil.il_req);
+
+ if (o2info_to_user(oil, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oil.il_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_handle_uuid(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_uuid oiu;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oiu, req))
+ goto bail;
+
+ memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
+
+ o2info_set_request_filled(&oiu.iu_req);
+
+ if (o2info_to_user(oiu, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oiu.iu_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_handle_fs_features(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_fs_features oif;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oif, req))
+ goto bail;
+
+ oif.if_compat_features = osb->s_feature_compat;
+ oif.if_incompat_features = osb->s_feature_incompat;
+ oif.if_ro_compat_features = osb->s_feature_ro_compat;
+
+ o2info_set_request_filled(&oif.if_req);
+
+ if (o2info_to_user(oif, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oif.if_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_handle_journal_size(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_journal_size oij;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (o2info_from_user(oij, req))
+ goto bail;
+
+ oij.ij_journal_size = i_size_read(osb->journal->j_inode);
+
+ o2info_set_request_filled(&oij.ij_req);
+
+ if (o2info_to_user(oij, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oij.ij_req, req);
+
+ return status;
+}
+
+static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
+ struct inode *inode_alloc, u64 blkno,
+ struct ocfs2_info_freeinode *fi,
+ u32 slot)
+{
+ int status = 0, unlock = 0;
+
+ struct buffer_head *bh = NULL;
+ struct ocfs2_dinode *dinode_alloc = NULL;
+
+ if (inode_alloc)
+ mutex_lock(&inode_alloc->i_mutex);
+
+ if (o2info_coherent(&fi->ifi_req)) {
+ status = ocfs2_inode_lock(inode_alloc, &bh, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ unlock = 1;
+ } else {
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
+
+ fi->ifi_stat[slot].lfi_total =
+ le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
+ fi->ifi_stat[slot].lfi_free =
+ le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
+ le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
+
+bail:
+ if (unlock)
+ ocfs2_inode_unlock(inode_alloc, 0);
+
+ if (inode_alloc)
+ mutex_unlock(&inode_alloc->i_mutex);
+
+ brelse(bh);
+
+ return status;
+}
+
+static int ocfs2_info_handle_freeinode(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ u32 i;
+ u64 blkno = -1;
+ char namebuf[40];
+ int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
+ struct ocfs2_info_freeinode *oifi = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *inode_alloc = NULL;
+
+ oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
+ if (!oifi) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_err;
+ }
+
+ if (o2info_from_user(*oifi, req))
+ goto bail;
+
+ oifi->ifi_slotnum = osb->max_slots;
+
+ for (i = 0; i < oifi->ifi_slotnum; i++) {
+ if (o2info_coherent(&oifi->ifi_req)) {
+ inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
+ if (!inode_alloc) {
+ mlog(ML_ERROR, "unable to get alloc inode in "
+ "slot %u\n", i);
+ status = -EIO;
+ goto bail;
+ }
+ } else {
+ ocfs2_sprintf_system_inode_name(namebuf,
+ sizeof(namebuf),
+ type, i);
+ status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+ namebuf,
+ strlen(namebuf),
+ &blkno);
+ if (status < 0) {
+ status = -ENOENT;
+ goto bail;
+ }
+ }
+
+ status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
+
+ iput(inode_alloc);
+ inode_alloc = NULL;
+
+ if (status < 0)
+ goto bail;
+ }
+
+ o2info_set_request_filled(&oifi->ifi_req);
+
+ if (o2info_to_user(*oifi, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oifi->ifi_req, req);
+
+ kfree(oifi);
+out_err:
+ return status;
+}
+
+static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
+ unsigned int chunksize)
+{
+ int index;
+
+ index = __ilog2_u32(chunksize);
+ if (index >= OCFS2_INFO_MAX_HIST)
+ index = OCFS2_INFO_MAX_HIST - 1;
+
+ hist->fc_chunks[index]++;
+ hist->fc_clusters[index] += chunksize;
+}
+
+static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
+ unsigned int chunksize)
+{
+ if (chunksize > stats->ffs_max)
+ stats->ffs_max = chunksize;
+
+ if (chunksize < stats->ffs_min)
+ stats->ffs_min = chunksize;
+
+ stats->ffs_avg += chunksize;
+ stats->ffs_free_chunks_real++;
+}
+
+static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
+ unsigned int chunksize)
+{
+ o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
+ o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
+}
+
+static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
+ struct inode *gb_inode,
+ struct ocfs2_dinode *gb_dinode,
+ struct ocfs2_chain_rec *rec,
+ struct ocfs2_info_freefrag *ffg,
+ u32 chunks_in_group)
+{
+ int status = 0, used;
+ u64 blkno;
+
+ struct buffer_head *bh = NULL;
+ struct ocfs2_group_desc *bg = NULL;
+
+ unsigned int max_bits, num_clusters;
+ unsigned int offset = 0, cluster, chunk;
+ unsigned int chunk_free, last_chunksize = 0;
+
+ if (!le32_to_cpu(rec->c_free))
+ goto bail;
+
+ do {
+ if (!bg)
+ blkno = le64_to_cpu(rec->c_blkno);
+ else
+ blkno = le64_to_cpu(bg->bg_next_group);
+
+ if (bh) {
+ brelse(bh);
+ bh = NULL;
+ }
+
+ if (o2info_coherent(&ffg->iff_req))
+ status = ocfs2_read_group_descriptor(gb_inode,
+ gb_dinode,
+ blkno, &bh);
+ else
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+
+ if (status < 0) {
+ mlog(ML_ERROR, "Can't read the group descriptor # "
+ "%llu from device.", (unsigned long long)blkno);
+ status = -EIO;
+ goto bail;
+ }
+
+ bg = (struct ocfs2_group_desc *)bh->b_data;
+
+ if (!le16_to_cpu(bg->bg_free_bits_count))
+ continue;
+
+ max_bits = le16_to_cpu(bg->bg_bits);
+ offset = 0;
+
+ for (chunk = 0; chunk < chunks_in_group; chunk++) {
+ /*
+ * last chunk may be not an entire one.
+ */
+ if ((offset + ffg->iff_chunksize) > max_bits)
+ num_clusters = max_bits - offset;
+ else
+ num_clusters = ffg->iff_chunksize;
+
+ chunk_free = 0;
+ for (cluster = 0; cluster < num_clusters; cluster++) {
+ used = ocfs2_test_bit(offset,
+ (unsigned long *)bg->bg_bitmap);
+ /*
+ * - chunk_free counts free clusters in #N chunk.
+ * - last_chunksize records the size(in) clusters
+ * for the last real free chunk being counted.
+ */
+ if (!used) {
+ last_chunksize++;
+ chunk_free++;
+ }
+
+ if (used && last_chunksize) {
+ ocfs2_info_update_ffg(ffg,
+ last_chunksize);
+ last_chunksize = 0;
+ }
+
+ offset++;
+ }
+
+ if (chunk_free == ffg->iff_chunksize)
+ ffg->iff_ffs.ffs_free_chunks++;
+ }
+
+ /*
+ * need to update the info for last free chunk.
+ */
+ if (last_chunksize)
+ ocfs2_info_update_ffg(ffg, last_chunksize);
+
+ } while (le64_to_cpu(bg->bg_next_group));
+
+bail:
+ brelse(bh);
+
+ return status;
+}
+
+static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
+ struct inode *gb_inode, u64 blkno,
+ struct ocfs2_info_freefrag *ffg)
+{
+ u32 chunks_in_group;
+ int status = 0, unlock = 0, i;
+
+ struct buffer_head *bh = NULL;
+ struct ocfs2_chain_list *cl = NULL;
+ struct ocfs2_chain_rec *rec = NULL;
+ struct ocfs2_dinode *gb_dinode = NULL;
+
+ if (gb_inode)
+ mutex_lock(&gb_inode->i_mutex);
+
+ if (o2info_coherent(&ffg->iff_req)) {
+ status = ocfs2_inode_lock(gb_inode, &bh, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ unlock = 1;
+ } else {
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ gb_dinode = (struct ocfs2_dinode *)bh->b_data;
+ cl = &(gb_dinode->id2.i_chain);
+
+ /*
+ * Chunksize(in) clusters from userspace should be
+ * less than clusters in a group.
+ */
+ if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
+
+ ffg->iff_ffs.ffs_min = ~0U;
+ ffg->iff_ffs.ffs_clusters =
+ le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
+ ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
+ le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
+
+ chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
+
+ for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+ rec = &(cl->cl_recs[i]);
+ status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
+ gb_dinode,
+ rec, ffg,
+ chunks_in_group);
+ if (status)
+ goto bail;
+ }
+
+ if (ffg->iff_ffs.ffs_free_chunks_real)
+ ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
+ ffg->iff_ffs.ffs_free_chunks_real);
+bail:
+ if (unlock)
+ ocfs2_inode_unlock(gb_inode, 0);
+
+ if (gb_inode)
+ mutex_unlock(&gb_inode->i_mutex);
+
+ if (gb_inode)
+ iput(gb_inode);
+
+ brelse(bh);
+
+ return status;
+}
+
+static int ocfs2_info_handle_freefrag(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ u64 blkno = -1;
+ char namebuf[40];
+ int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
+
+ struct ocfs2_info_freefrag *oiff;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *gb_inode = NULL;
+
+ oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
+ if (!oiff) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_err;
+ }
+
+ if (o2info_from_user(*oiff, req))
+ goto bail;
+ /*
+ * chunksize from userspace should be power of 2.
+ */
+ if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
+ (!oiff->iff_chunksize)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ if (o2info_coherent(&oiff->iff_req)) {
+ gb_inode = ocfs2_get_system_file_inode(osb, type,
+ OCFS2_INVALID_SLOT);
+ if (!gb_inode) {
+ mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+ status = -EIO;
+ goto bail;
+ }
+ } else {
+ ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
+ OCFS2_INVALID_SLOT);
+ status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+ namebuf,
+ strlen(namebuf),
+ &blkno);
+ if (status < 0) {
+ status = -ENOENT;
+ goto bail;
+ }
+ }
+
+ status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
+ if (status < 0)
+ goto bail;
+
+ o2info_set_request_filled(&oiff->iff_req);
+
+ if (o2info_to_user(*oiff, req)) {
+ status = -EFAULT;
+ goto bail;
+ }
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oiff->iff_req, req);
+
+ kfree(oiff);
+out_err:
+ return status;
+}
+
+static int ocfs2_info_handle_unknown(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_request oir;
+
+ if (o2info_from_user(oir, req))
+ goto bail;
+
+ o2info_clear_request_filled(&oir);
+
+ if (o2info_to_user(oir, req))
+ goto bail;
+
+ status = 0;
+bail:
+ if (status)
+ o2info_set_request_error(&oir, req);
+
+ return status;
+}
+
+/*
+ * Validate and distinguish OCFS2_IOC_INFO requests.
+ *
+ * - validate the magic number.
+ * - distinguish different requests.
+ * - validate size of different requests.
+ */
+static int ocfs2_info_handle_request(struct inode *inode,
+ struct ocfs2_info_request __user *req)
+{
+ int status = -EFAULT;
+ struct ocfs2_info_request oir;
+
+ if (o2info_from_user(oir, req))
+ goto bail;
+
+ status = -EINVAL;
+ if (oir.ir_magic != OCFS2_INFO_MAGIC)
+ goto bail;
+
+ switch (oir.ir_code) {
+ case OCFS2_INFO_BLOCKSIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_blocksize))
+ status = ocfs2_info_handle_blocksize(inode, req);
+ break;
+ case OCFS2_INFO_CLUSTERSIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_clustersize))
+ status = ocfs2_info_handle_clustersize(inode, req);
+ break;
+ case OCFS2_INFO_MAXSLOTS:
+ if (oir.ir_size == sizeof(struct ocfs2_info_maxslots))
+ status = ocfs2_info_handle_maxslots(inode, req);
+ break;
+ case OCFS2_INFO_LABEL:
+ if (oir.ir_size == sizeof(struct ocfs2_info_label))
+ status = ocfs2_info_handle_label(inode, req);
+ break;
+ case OCFS2_INFO_UUID:
+ if (oir.ir_size == sizeof(struct ocfs2_info_uuid))
+ status = ocfs2_info_handle_uuid(inode, req);
+ break;
+ case OCFS2_INFO_FS_FEATURES:
+ if (oir.ir_size == sizeof(struct ocfs2_info_fs_features))
+ status = ocfs2_info_handle_fs_features(inode, req);
+ break;
+ case OCFS2_INFO_JOURNAL_SIZE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
+ status = ocfs2_info_handle_journal_size(inode, req);
+ break;
+ case OCFS2_INFO_FREEINODE:
+ if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
+ status = ocfs2_info_handle_freeinode(inode, req);
+ break;
+ case OCFS2_INFO_FREEFRAG:
+ if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
+ status = ocfs2_info_handle_freefrag(inode, req);
+ break;
+ default:
+ status = ocfs2_info_handle_unknown(inode, req);
+ break;
+ }
+
+bail:
+ return status;
+}
+
+static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
+ u64 *req_addr, int compat_flag)
+{
+ int status = -EFAULT;
+ u64 __user *bp = NULL;
+
+ if (compat_flag) {
+#ifdef CONFIG_COMPAT
+ /*
+ * pointer bp stores the base address of a pointers array,
+ * which collects all addresses of separate request.
+ */
+ bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
+#else
+ BUG();
+#endif
+ } else
+ bp = (u64 __user *)(unsigned long)(info->oi_requests);
+
+ if (o2info_from_user(*req_addr, bp + idx))
+ goto bail;
+
+ status = 0;
+bail:
+ return status;
+}
+
+/*
+ * OCFS2_IOC_INFO handles an array of requests passed from userspace.
+ *
+ * ocfs2_info_handle() recevies a large info aggregation, grab and
+ * validate the request count from header, then break it into small
+ * pieces, later specific handlers can handle them one by one.
+ *
+ * Idea here is to make each separate request small enough to ensure
+ * a better backward&forward compatibility, since a small piece of
+ * request will be less likely to be broken if disk layout get changed.
+ */
+static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
+ int compat_flag)
+{
+ int i, status = 0;
+ u64 req_addr;
+ struct ocfs2_info_request __user *reqp;
+
+ if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) ||
+ (!info->oi_requests)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ for (i = 0; i < info->oi_count; i++) {
+
+ status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag);
+ if (status)
+ break;
+
+ reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr;
+ if (!reqp) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ status = ocfs2_info_handle_request(inode, reqp);
+ if (status)
+ break;
+ }
+
+bail:
return status;
}
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
unsigned int flags;
int new_clusters;
int status;
struct ocfs2_space_resv sr;
struct ocfs2_new_group_input input;
struct reflink_arguments args;
- const char *old_path, *new_path;
+ const char __user *old_path;
+ const char __user *new_path;
bool preserve;
+ struct ocfs2_info info;
+ void __user *argp = (void __user *)arg;
switch (cmd) {
case OCFS2_IOC_GETFLAGS:
@@ -133,12 +911,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- status = mnt_want_write(filp->f_path.mnt);
+ status = mnt_want_write_file(filp);
if (status)
return status;
status = ocfs2_set_inode_attr(inode, flags,
OCFS2_FL_MODIFIABLE);
- mnt_drop_write(filp->f_path.mnt);
+ mnt_drop_write_file(filp);
return status;
case OCFS2_IOC_RESVSP:
case OCFS2_IOC_RESVSP64:
@@ -155,7 +933,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (get_user(new_clusters, (int __user *)arg))
return -EFAULT;
- return ocfs2_group_extend(inode, new_clusters);
+ status = mnt_want_write_file(filp);
+ if (status)
+ return status;
+ status = ocfs2_group_extend(inode, new_clusters);
+ mnt_drop_write_file(filp);
+ return status;
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
if (!capable(CAP_SYS_RESOURCE))
@@ -164,16 +947,54 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
return -EFAULT;
- return ocfs2_group_add(inode, &input);
+ status = mnt_want_write_file(filp);
+ if (status)
+ return status;
+ status = ocfs2_group_add(inode, &input);
+ mnt_drop_write_file(filp);
+ return status;
case OCFS2_IOC_REFLINK:
- if (copy_from_user(&args, (struct reflink_arguments *)arg,
- sizeof(args)))
+ if (copy_from_user(&args, argp, sizeof(args)))
return -EFAULT;
- old_path = (const char *)(unsigned long)args.old_path;
- new_path = (const char *)(unsigned long)args.new_path;
+ old_path = (const char __user *)(unsigned long)args.old_path;
+ new_path = (const char __user *)(unsigned long)args.new_path;
preserve = (args.preserve != 0);
return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+ case OCFS2_IOC_INFO:
+ if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
+ return -EFAULT;
+
+ return ocfs2_info_handle(inode, &info, 0);
+ case FITRIM:
+ {
+ struct super_block *sb = inode->i_sb;
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ struct fstrim_range range;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&range, argp, sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max_t(u64, q->limits.discard_granularity,
+ range.minlen);
+ ret = ocfs2_trim_fs(sb, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(argp, &range, sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+ }
+ case OCFS2_IOC_MOVE_EXT:
+ return ocfs2_ioctl_move_extents(filp, argp);
default:
return -ENOTTY;
}
@@ -184,7 +1005,9 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
bool preserve;
struct reflink_arguments args;
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
+ struct ocfs2_info info;
+ void __user *argp = (void __user *)arg;
switch (cmd) {
case OCFS2_IOC32_GETFLAGS:
@@ -200,15 +1023,22 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case OCFS2_IOC_GROUP_EXTEND:
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
+ case FITRIM:
break;
case OCFS2_IOC_REFLINK:
- if (copy_from_user(&args, (struct reflink_arguments *)arg,
- sizeof(args)))
+ if (copy_from_user(&args, argp, sizeof(args)))
return -EFAULT;
preserve = (args.preserve != 0);
return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
compat_ptr(args.new_path), preserve);
+ case OCFS2_IOC_INFO:
+ if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
+ return -EFAULT;
+
+ return ocfs2_info_handle(inode, &info, 1);
+ case OCFS2_IOC_MOVE_EXT:
+ break;
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9336c60e3a3..4b0c68849b3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -30,8 +30,8 @@
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/random.h>
+#include <linux/delay.h>
-#define MLOG_MASK_PREFIX ML_JOURNAL
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -52,6 +52,7 @@
#include "quota.h"
#include "buffer_head_io.h"
+#include "ocfs2_trace.h"
DEFINE_SPINLOCK(trans_inc_lock);
@@ -301,19 +302,17 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
{
int status = 0;
unsigned int flushed;
- unsigned long old_id;
struct ocfs2_journal *journal = NULL;
- mlog_entry_void();
-
journal = osb->journal;
/* Flush all pending commits and checkpoint the journal. */
down_write(&journal->j_trans_barrier);
- if (atomic_read(&journal->j_num_trans) == 0) {
+ flushed = atomic_read(&journal->j_num_trans);
+ trace_ocfs2_commit_cache_begin(flushed);
+ if (flushed == 0) {
up_write(&journal->j_trans_barrier);
- mlog(0, "No transactions for me to flush!\n");
goto finally;
}
@@ -326,25 +325,20 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
goto finally;
}
- old_id = ocfs2_inc_trans_id(journal);
+ ocfs2_inc_trans_id(journal);
flushed = atomic_read(&journal->j_num_trans);
atomic_set(&journal->j_num_trans, 0);
up_write(&journal->j_trans_barrier);
- mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
- journal->j_trans_id, flushed);
+ trace_ocfs2_commit_cache_end(journal->j_trans_id, flushed);
ocfs2_wake_downconvert_thread(osb);
wake_up(&journal->j_checkpointed);
finally:
- mlog_exit(status);
return status;
}
-/* pass it NULL and it will allocate a new handle object for you. If
- * you pass it a handle however, it may still return error, in which
- * case it has free'd the passed handle for you. */
handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
{
journal_t *journal = osb->journal->j_journal;
@@ -362,11 +356,14 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
if (journal_current_handle())
return jbd2_journal_start(journal, max_buffs);
+ sb_start_intwrite(osb->sb);
+
down_read(&osb->journal->j_trans_barrier);
handle = jbd2_journal_start(journal, max_buffs);
if (IS_ERR(handle)) {
up_read(&osb->journal->j_trans_barrier);
+ sb_end_intwrite(osb->sb);
mlog_errno(PTR_ERR(handle));
@@ -395,16 +392,16 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
if (ret < 0)
mlog_errno(ret);
- if (!nested)
+ if (!nested) {
up_read(&journal->j_trans_barrier);
+ sb_end_intwrite(osb->sb);
+ }
return ret;
}
/*
- * 'nblocks' is what you want to add to the current
- * transaction. extend_trans will either extend the current handle by
- * nblocks, or commit it and start a new one with nblocks credits.
+ * 'nblocks' is what you want to add to the current transaction.
*
* This might call jbd2_journal_restart() which will commit dirty buffers
* and then restart the transaction. Before calling
@@ -422,14 +419,17 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
*/
int ocfs2_extend_trans(handle_t *handle, int nblocks)
{
- int status;
+ int status, old_nblocks;
BUG_ON(!handle);
- BUG_ON(!nblocks);
+ BUG_ON(nblocks < 0);
- mlog_entry_void();
+ if (!nblocks)
+ return 0;
+
+ old_nblocks = handle->h_buffer_credits;
- mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
+ trace_ocfs2_extend_trans(old_nblocks, nblocks);
#ifdef CONFIG_OCFS2_DEBUG_FS
status = 1;
@@ -442,10 +442,9 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
#endif
if (status > 0) {
- mlog(0,
- "jbd2_journal_extend failed, trying "
- "jbd2_journal_restart\n");
- status = jbd2_journal_restart(handle, nblocks);
+ trace_ocfs2_extend_trans_restart(old_nblocks + nblocks);
+ status = jbd2_journal_restart(handle,
+ old_nblocks + nblocks);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -454,11 +453,44 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
status = 0;
bail:
+ return status;
+}
+
+/*
+ * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
+ * If that fails, restart the transaction & regain write access for the
+ * buffer head which is used for metadata modifications.
+ * Taken from Ext4: extend_or_restart_transaction()
+ */
+int ocfs2_allocate_extend_trans(handle_t *handle, int thresh)
+{
+ int status, old_nblks;
+
+ BUG_ON(!handle);
+
+ old_nblks = handle->h_buffer_credits;
+ trace_ocfs2_allocate_extend_trans(old_nblks, thresh);
+
+ if (old_nblks < thresh)
+ return 0;
+
+ status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ if (status > 0) {
+ status = jbd2_journal_restart(handle, OCFS2_MAX_TRANS_DATA);
+ if (status < 0)
+ mlog_errno(status);
+ }
- mlog_exit(status);
+bail:
return status;
}
+
struct ocfs2_triggers {
struct jbd2_buffer_trigger_type ot_triggers;
int ot_offset;
@@ -469,7 +501,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger
return container_of(triggers, struct ocfs2_triggers, ot_triggers);
}
-static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh,
void *data, size_t size)
{
@@ -488,7 +520,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
* Quota blocks have their own trigger because the struct ocfs2_block_check
* offset depends on the blocksize.
*/
-static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh,
void *data, size_t size)
{
@@ -508,7 +540,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
* Directory blocks also have their own trigger because the
* struct ocfs2_block_check offset depends on the blocksize.
*/
-static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh,
void *data, size_t size)
{
@@ -541,7 +573,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
static struct ocfs2_triggers di_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_dinode, i_check),
@@ -549,7 +581,7 @@ static struct ocfs2_triggers di_triggers = {
static struct ocfs2_triggers eb_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_extent_block, h_check),
@@ -557,7 +589,7 @@ static struct ocfs2_triggers eb_triggers = {
static struct ocfs2_triggers rb_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_refcount_block, rf_check),
@@ -565,7 +597,7 @@ static struct ocfs2_triggers rb_triggers = {
static struct ocfs2_triggers gd_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
@@ -573,14 +605,14 @@ static struct ocfs2_triggers gd_triggers = {
static struct ocfs2_triggers db_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_db_commit_trigger,
+ .t_frozen = ocfs2_db_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
};
static struct ocfs2_triggers xb_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
@@ -588,14 +620,14 @@ static struct ocfs2_triggers xb_triggers = {
static struct ocfs2_triggers dq_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_dq_commit_trigger,
+ .t_frozen = ocfs2_dq_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
};
static struct ocfs2_triggers dr_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check),
@@ -603,7 +635,7 @@ static struct ocfs2_triggers dr_triggers = {
static struct ocfs2_triggers dl_triggers = {
.ot_triggers = {
- .t_commit = ocfs2_commit_trigger,
+ .t_frozen = ocfs2_frozen_trigger,
.t_abort = ocfs2_abort_trigger,
},
.ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check),
@@ -623,12 +655,9 @@ static int __ocfs2_journal_access(handle_t *handle,
BUG_ON(!handle);
BUG_ON(!bh);
- mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n",
- (unsigned long long)bh->b_blocknr, type,
- (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
- "OCFS2_JOURNAL_ACCESS_CREATE" :
- "OCFS2_JOURNAL_ACCESS_WRITE",
- bh->b_size);
+ trace_ocfs2_journal_access(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)bh->b_blocknr, type, bh->b_size);
/* we can safely remove this assertion after testing. */
if (!buffer_uptodate(bh)) {
@@ -669,7 +698,6 @@ static int __ocfs2_journal_access(handle_t *handle,
mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
status, type);
- mlog_exit(status);
return status;
}
@@ -734,22 +762,14 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
return __ocfs2_journal_access(handle, ci, bh, NULL, type);
}
-int ocfs2_journal_dirty(handle_t *handle,
- struct buffer_head *bh)
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
{
int status;
- mlog_entry("(bh->b_blocknr=%llu)\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
status = jbd2_journal_dirty_metadata(handle, bh);
- if (status < 0)
- mlog(ML_ERROR, "Could not dirty metadata buffer. "
- "(bh->b_blocknr=%llu)\n",
- (unsigned long long)bh->b_blocknr);
-
- mlog_exit(status);
- return status;
+ BUG_ON(status);
}
#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -762,13 +782,13 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
if (osb->osb_commit_interval)
commit_interval = osb->osb_commit_interval;
- spin_lock(&journal->j_state_lock);
+ write_lock(&journal->j_state_lock);
journal->j_commit_interval = commit_interval;
if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
journal->j_flags |= JBD2_BARRIER;
else
journal->j_flags &= ~JBD2_BARRIER;
- spin_unlock(&journal->j_state_lock);
+ write_unlock(&journal->j_state_lock);
}
int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
@@ -781,8 +801,6 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
struct ocfs2_super *osb;
int inode_lock = 0;
- mlog_entry_void();
-
BUG_ON(!journal);
osb = journal->j_osb;
@@ -819,17 +837,16 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
inode_lock = 1;
di = (struct ocfs2_dinode *)bh->b_data;
- if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
+ if (i_size_read(inode) < OCFS2_MIN_JOURNAL_SIZE) {
mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
- inode->i_size);
+ i_size_read(inode));
status = -EINVAL;
goto done;
}
- mlog(0, "inode->i_size = %lld\n", inode->i_size);
- mlog(0, "inode->i_blocks = %llu\n",
- (unsigned long long)inode->i_blocks);
- mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
+ trace_ocfs2_journal_init(i_size_read(inode),
+ (unsigned long long)inode->i_blocks,
+ OCFS2_I(inode)->ip_clusters);
/* call the kernels journal init function now */
j_journal = jbd2_journal_init_inode(inode);
@@ -839,8 +856,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
goto done;
}
- mlog(0, "Returned from jbd2_journal_init_inode\n");
- mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
+ trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen);
*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
OCFS2_JOURNAL_DIRTY_FL);
@@ -865,7 +881,6 @@ done:
}
}
- mlog_exit(status);
return status;
}
@@ -888,8 +903,6 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
struct buffer_head *bh = journal->j_bh;
struct ocfs2_dinode *fe;
- mlog_entry_void();
-
fe = (struct ocfs2_dinode *)bh->b_data;
/* The journal bh on the osb always comes from ocfs2_journal_init()
@@ -912,7 +925,6 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
if (status < 0)
mlog_errno(status);
- mlog_exit(status);
return status;
}
@@ -927,8 +939,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
struct inode *inode = NULL;
int num_running_trans = 0;
- mlog_entry_void();
-
BUG_ON(!osb);
journal = osb->journal;
@@ -945,10 +955,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
BUG();
num_running_trans = atomic_read(&(osb->journal->j_num_trans));
- if (num_running_trans > 0)
- mlog(0, "Shutting down journal: must wait on %d "
- "running transactions!\n",
- num_running_trans);
+ trace_ocfs2_journal_shutdown(num_running_trans);
/* Do a commit_cache here. It will flush our journal, *and*
* release any locks that are still held.
@@ -961,7 +968,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
* completely destroy the journal. */
if (osb->commit_task) {
/* Wait for the commit thread */
- mlog(0, "Waiting for ocfs2commit to exit....\n");
+ trace_ocfs2_journal_shutdown_wait(osb->commit_task);
kthread_stop(osb->commit_task);
osb->commit_task = NULL;
}
@@ -1004,7 +1011,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
done:
if (inode)
iput(inode);
- mlog_exit_void();
}
static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1030,8 +1036,6 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
int status = 0;
struct ocfs2_super *osb;
- mlog_entry_void();
-
BUG_ON(!journal);
osb = journal->j_osb;
@@ -1065,7 +1069,6 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
osb->commit_task = NULL;
done:
- mlog_exit(status);
return status;
}
@@ -1076,8 +1079,6 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
{
int status;
- mlog_entry_void();
-
BUG_ON(!journal);
status = jbd2_journal_wipe(journal->j_journal, full);
@@ -1091,7 +1092,6 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
mlog_errno(status);
bail:
- mlog_exit(status);
return status;
}
@@ -1130,11 +1130,9 @@ static int ocfs2_force_read_journal(struct inode *inode)
#define CONCURRENT_JOURNAL_FILL 32ULL
struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
- mlog_entry_void();
-
memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
- num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
+ num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
v_blkno = 0;
while (v_blkno < num_blocks) {
status = ocfs2_extent_map_get_blocks(inode, v_blkno,
@@ -1167,7 +1165,6 @@ static int ocfs2_force_read_journal(struct inode *inode)
bail:
for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
brelse(bhs[i]);
- mlog_exit(status);
return status;
}
@@ -1191,7 +1188,7 @@ struct ocfs2_la_recovery_item {
*/
void ocfs2_complete_recovery(struct work_struct *work)
{
- int ret;
+ int ret = 0;
struct ocfs2_journal *journal =
container_of(work, struct ocfs2_journal, j_recovery_work);
struct ocfs2_super *osb = journal->j_osb;
@@ -1200,9 +1197,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
struct ocfs2_quota_recovery *qrec;
LIST_HEAD(tmp_la_list);
- mlog_entry_void();
-
- mlog(0, "completing recovery from keventd\n");
+ trace_ocfs2_complete_recovery(
+ (unsigned long long)OCFS2_I(journal->j_inode)->ip_blkno);
spin_lock(&journal->j_lock);
list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
@@ -1211,15 +1207,18 @@ void ocfs2_complete_recovery(struct work_struct *work)
list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
list_del_init(&item->lri_list);
- mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
-
ocfs2_wait_on_quotas(osb);
la_dinode = item->lri_la_dinode;
- if (la_dinode) {
- mlog(0, "Clean up local alloc %llu\n",
- (unsigned long long)le64_to_cpu(la_dinode->i_blkno));
+ tl_dinode = item->lri_tl_dinode;
+ qrec = item->lri_qrec;
+
+ trace_ocfs2_complete_recovery_slot(item->lri_slot,
+ la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
+ tl_dinode ? le64_to_cpu(tl_dinode->i_blkno) : 0,
+ qrec);
+ if (la_dinode) {
ret = ocfs2_complete_local_alloc_recovery(osb,
la_dinode);
if (ret < 0)
@@ -1228,11 +1227,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
kfree(la_dinode);
}
- tl_dinode = item->lri_tl_dinode;
if (tl_dinode) {
- mlog(0, "Clean up truncate log %llu\n",
- (unsigned long long)le64_to_cpu(tl_dinode->i_blkno));
-
ret = ocfs2_complete_truncate_log_recovery(osb,
tl_dinode);
if (ret < 0)
@@ -1245,9 +1240,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
if (ret < 0)
mlog_errno(ret);
- qrec = item->lri_qrec;
if (qrec) {
- mlog(0, "Recovering quota files");
ret = ocfs2_finish_quota_recovery(osb, qrec,
item->lri_slot);
if (ret < 0)
@@ -1258,8 +1251,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
kfree(item);
}
- mlog(0, "Recovery completion\n");
- mlog_exit_void();
+ trace_ocfs2_complete_recovery_end(ret);
}
/* NOTE: This function always eats your references to la_dinode and
@@ -1278,11 +1270,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
/* Though we wish to avoid it, we are in fact safe in
* skipping local alloc cleanup as fsck.ocfs2 is more
* than capable of reclaiming unused space. */
- if (la_dinode)
- kfree(la_dinode);
-
- if (tl_dinode)
- kfree(tl_dinode);
+ kfree(la_dinode);
+ kfree(tl_dinode);
if (qrec)
ocfs2_free_quota_recovery(qrec);
@@ -1309,6 +1298,9 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
{
struct ocfs2_journal *journal = osb->journal;
+ if (ocfs2_is_hard_readonly(osb))
+ return;
+
/* No need to queue up our truncate_log as regular cleanup will catch
* that */
ocfs2_queue_recovery_completion(journal, osb->slot_num,
@@ -1345,8 +1337,6 @@ static int __ocfs2_recovery_thread(void *arg)
int rm_quota_used = 0, i;
struct ocfs2_quota_recovery *qrec;
- mlog_entry_void();
-
status = ocfs2_wait_on_mount(osb);
if (status < 0) {
goto bail;
@@ -1378,15 +1368,12 @@ restart:
* clear it until ocfs2_recover_node() has succeeded. */
node_num = rm->rm_entries[0];
spin_unlock(&osb->osb_lock);
- mlog(0, "checking node %d\n", node_num);
slot_num = ocfs2_node_num_to_slot(osb, node_num);
+ trace_ocfs2_recovery_thread_node(node_num, slot_num);
if (slot_num == -ENOENT) {
status = 0;
- mlog(0, "no slot for this node, so no recovery"
- "required.\n");
goto skip_recovery;
}
- mlog(0, "node %d was using slot %d\n", node_num, slot_num);
/* It is a bit subtle with quota recovery. We cannot do it
* immediately because we have to obtain cluster locks from
@@ -1413,7 +1400,7 @@ skip_recovery:
spin_lock(&osb->osb_lock);
}
spin_unlock(&osb->osb_lock);
- mlog(0, "All nodes recovered\n");
+ trace_ocfs2_recovery_thread_end(status);
/* Refresh all journal recovery generations from disk */
status = ocfs2_check_journals_nolocks(osb);
@@ -1422,7 +1409,7 @@ skip_recovery:
mlog_errno(status);
/* Now it is right time to recover quotas... We have to do this under
- * superblock lock so that noone can start using the slot (and crash)
+ * superblock lock so that no one can start using the slot (and crash)
* before we recover it */
for (i = 0; i < rm_quota_used; i++) {
qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
@@ -1454,10 +1441,8 @@ bail:
mutex_unlock(&osb->recovery_lock);
- if (rm_quota)
- kfree(rm_quota);
+ kfree(rm_quota);
- mlog_exit(status);
/* no one is callint kthread_stop() for us so the kthread() api
* requires that we call do_exit(). And it isn't exported, but
* complete_and_exit() seems to be a minimal wrapper around it. */
@@ -1467,19 +1452,15 @@ bail:
void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
{
- mlog_entry("(node_num=%d, osb->node_num = %d)\n",
- node_num, osb->node_num);
-
mutex_lock(&osb->recovery_lock);
- if (osb->disable_recovery)
- goto out;
- /* People waiting on recovery will wait on
- * the recovery map to empty. */
- if (ocfs2_recovery_map_set(osb, node_num))
- mlog(0, "node %d already in recovery map.\n", node_num);
+ trace_ocfs2_recovery_thread(node_num, osb->node_num,
+ osb->disable_recovery, osb->recovery_thread_task,
+ osb->disable_recovery ?
+ -1 : ocfs2_recovery_map_set(osb, node_num));
- mlog(0, "starting recovery thread...\n");
+ if (osb->disable_recovery)
+ goto out;
if (osb->recovery_thread_task)
goto out;
@@ -1494,8 +1475,6 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
out:
mutex_unlock(&osb->recovery_lock);
wake_up(&osb->recovery_event);
-
- mlog_exit_void();
}
static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
@@ -1569,7 +1548,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
* If not, it needs recovery.
*/
if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
- mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num,
+ trace_ocfs2_replay_journal_recovered(slot_num,
osb->slot_recovery_generations[slot_num], slot_reco_gen);
osb->slot_recovery_generations[slot_num] = slot_reco_gen;
status = -EBUSY;
@@ -1580,7 +1559,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
if (status < 0) {
- mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
+ trace_ocfs2_replay_journal_lock_err(status);
if (status != -ERESTARTSYS)
mlog(ML_ERROR, "Could not lock journal!\n");
goto done;
@@ -1593,7 +1572,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
slot_reco_gen = ocfs2_get_recovery_generation(fe);
if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
- mlog(0, "No recovery required for node %d\n", node_num);
+ trace_ocfs2_replay_journal_skip(node_num);
/* Refresh recovery generation for the slot */
osb->slot_recovery_generations[slot_num] = slot_reco_gen;
goto done;
@@ -1602,9 +1581,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
/* we need to run complete recovery for offline orphan slots */
ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
- mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
- node_num, slot_num,
- MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+ printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\
+ "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -1614,7 +1593,6 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
goto done;
}
- mlog(0, "calling journal_init_inode\n");
journal = jbd2_journal_init_inode(inode);
if (journal == NULL) {
mlog(ML_ERROR, "Linux journal layer error\n");
@@ -1634,7 +1612,6 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
ocfs2_clear_journal_error(osb->sb, journal, slot_num);
/* wipe the journal */
- mlog(0, "flushing the journal.\n");
jbd2_journal_lock_updates(journal);
status = jbd2_journal_flush(journal);
jbd2_journal_unlock_updates(journal);
@@ -1661,6 +1638,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
jbd2_journal_destroy(journal);
+ printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\
+ "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+ MINOR(osb->sb->s_dev));
done:
/* drop the lock on this nodes journal */
if (got_lock)
@@ -1671,7 +1651,6 @@ done:
brelse(bh);
- mlog_exit(status);
return status;
}
@@ -1694,8 +1673,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
struct ocfs2_dinode *la_copy = NULL;
struct ocfs2_dinode *tl_copy = NULL;
- mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
- node_num, slot_num, osb->node_num);
+ trace_ocfs2_recover_node(node_num, slot_num, osb->node_num);
/* Should not ever be called to recover ourselves -- in that
* case we should've called ocfs2_journal_load instead. */
@@ -1704,9 +1682,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
status = ocfs2_replay_journal(osb, node_num, slot_num);
if (status < 0) {
if (status == -EBUSY) {
- mlog(0, "Skipping recovery for slot %u (node %u) "
- "as another node has recovered it\n", slot_num,
- node_num);
+ trace_ocfs2_recover_node_skip(slot_num, node_num);
status = 0;
goto done;
}
@@ -1741,7 +1717,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
status = 0;
done:
- mlog_exit(status);
return status;
}
@@ -1814,8 +1789,8 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
spin_lock(&osb->osb_lock);
osb->slot_recovery_generations[i] = gen;
- mlog(0, "Slot %u recovery generation is %u\n", i,
- osb->slot_recovery_generations[i]);
+ trace_ocfs2_mark_dead_nodes(i,
+ osb->slot_recovery_generations[i]);
if (i == osb->slot_num) {
spin_unlock(&osb->osb_lock);
@@ -1851,7 +1826,6 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
status = 0;
bail:
- mlog_exit(status);
return status;
}
@@ -1874,6 +1848,20 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
* every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
* is done to catch any orphans that are left over in orphan directories.
*
+ * It scans all slots, even ones that are in use. It does so to handle the
+ * case described below:
+ *
+ * Node 1 has an inode it was using. The dentry went away due to memory
+ * pressure. Node 1 closes the inode, but it's on the free list. The node
+ * has the open lock.
+ * Node 2 unlinks the inode. It grabs the dentry lock to notify others,
+ * but node 1 has no dentry and doesn't get the message. It trylocks the
+ * open lock, sees that another node has a PR, and does nothing.
+ * Later node 2 runs its orphan dir. It igets the inode, trylocks the
+ * open lock, sees the PR still, and does nothing.
+ * Basically, we have to trigger an orphan iput on node 1. The only way
+ * for this to happen is if node 1 runs node 2's orphan dir.
+ *
* ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
* seconds. It gets an EX lock on os_lockres and checks sequence number
* stored in LVB. If the sequence number has changed, it means some other
@@ -1893,6 +1881,9 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
goto out;
+ trace_ocfs2_queue_orphan_scan_begin(os->os_count, os->os_seqno,
+ atomic_read(&os->os_state));
+
status = ocfs2_orphan_scan_lock(osb, &seqno);
if (status < 0) {
if (status != -EAGAIN)
@@ -1922,6 +1913,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
unlock:
ocfs2_orphan_scan_unlock(osb, seqno);
out:
+ trace_ocfs2_queue_orphan_scan_end(os->os_count, os->os_seqno,
+ atomic_read(&os->os_state));
return;
}
@@ -1938,7 +1931,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work)
mutex_lock(&os->os_lock);
ocfs2_queue_orphan_scan(osb);
if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
- schedule_delayed_work(&os->os_orphan_scan_work,
+ queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
mutex_unlock(&os->os_lock);
}
@@ -1978,12 +1971,13 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {
atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
- schedule_delayed_work(&os->os_orphan_scan_work,
- ocfs2_orphan_scan_timeout());
+ queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ ocfs2_orphan_scan_timeout());
}
}
struct ocfs2_orphan_filldir_priv {
+ struct dir_context ctx;
struct inode *head;
struct ocfs2_super *osb;
};
@@ -2005,8 +1999,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
if (IS_ERR(iter))
return 0;
- mlog(0, "queue orphan %llu\n",
- (unsigned long long)OCFS2_I(iter)->ip_blkno);
+ trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
/* No locking is required for the next_orphan queue as there
* is only ever a single process doing orphan recovery. */
OCFS2_I(iter)->ip_next_orphan = p->head;
@@ -2021,11 +2014,11 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
{
int status;
struct inode *orphan_dir_inode = NULL;
- struct ocfs2_orphan_filldir_priv priv;
- loff_t pos = 0;
-
- priv.osb = osb;
- priv.head = *head;
+ struct ocfs2_orphan_filldir_priv priv = {
+ .ctx.actor = ocfs2_orphan_filldir,
+ .osb = osb,
+ .head = *head
+ };
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
ORPHAN_DIR_SYSTEM_INODE,
@@ -2043,8 +2036,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
goto out;
}
- status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv,
- ocfs2_orphan_filldir);
+ status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx);
if (status) {
mlog_errno(status);
goto out_cluster;
@@ -2122,7 +2114,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
struct inode *iter;
struct ocfs2_inode_info *oi;
- mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
+ trace_ocfs2_recover_orphans(slot);
ocfs2_mark_recovering_orphan_dir(osb, slot);
ret = ocfs2_queue_orphans(osb, slot, &inode);
@@ -2135,17 +2127,12 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
while (inode) {
oi = OCFS2_I(inode);
- mlog(0, "iput orphan %llu\n", (unsigned long long)oi->ip_blkno);
+ trace_ocfs2_recover_orphans_iput(
+ (unsigned long long)oi->ip_blkno);
iter = oi->ip_next_orphan;
spin_lock(&oi->ip_lock);
- /* The remote delete code may have set these on the
- * assumption that the other node would wipe them
- * successfully. If they are still in the node's
- * orphan dir, we need to reset that state. */
- oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
-
/* Set the proper information to get us going into
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
@@ -2173,6 +2160,7 @@ static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
* MOUNTED flag, but this is set right before
* dismount_volume() so we can trust it. */
if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
+ trace_ocfs2_wait_on_mount(VOLUME_DISABLED);
mlog(0, "mount error, exiting!\n");
return -EBUSY;
}
@@ -2198,8 +2186,20 @@ static int ocfs2_commit_thread(void *arg)
|| kthread_should_stop());
status = ocfs2_commit_cache(osb);
- if (status < 0)
- mlog_errno(status);
+ if (status < 0) {
+ static unsigned long abort_warn_time;
+
+ /* Warn about this once per minute */
+ if (printk_timed_ratelimit(&abort_warn_time, 60*HZ))
+ mlog(ML_ERROR, "status = %d, journal is "
+ "already aborted.\n", status);
+ /*
+ * After ocfs2_commit_cache() fails, j_num_trans has a
+ * non-zero value. Sleep here to avoid a busy-wait
+ * loop.
+ */
+ msleep_interruptible(1000);
+ }
if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
mlog(ML_KTHREAD,
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3f74e09b0d8..7f8cde94abf 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
struct buffer_head *j_bh; /* Journal disk inode block */
atomic_t j_num_trans; /* Number of transactions
* currently in the system. */
+ spinlock_t j_lock;
unsigned long j_trans_id;
struct rw_semaphore j_trans_barrier;
wait_queue_head_t j_checkpointed;
- spinlock_t j_lock;
+ /* both fields protected by j_lock*/
struct list_head j_la_cleanups;
struct work_struct j_recovery_work;
};
@@ -199,7 +200,6 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
{
- atomic_set(&osb->needs_checkpoint, 1);
wake_up(&osb->checkpoint_event);
}
@@ -214,7 +214,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
/* WARNING: This only kicks off a single
* checkpoint. If someone races you and adds more
* metadata to the journal, you won't know, and will
- * wind up waiting *alot* longer than necessary. Right
+ * wind up waiting *a lot* longer than necessary. Right
* now we only use this in clear_inode so that's
* OK. */
ocfs2_start_checkpoint(osb);
@@ -258,6 +258,17 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb,
int ocfs2_commit_trans(struct ocfs2_super *osb,
handle_t *handle);
int ocfs2_extend_trans(handle_t *handle, int nblocks);
+int ocfs2_allocate_extend_trans(handle_t *handle,
+ int thresh);
+
+/*
+ * Define an arbitrary limit for the amount of data we will anticipate
+ * writing to any given transaction. For unbounded transactions such as
+ * fallocate(2) we can write more than this, but we always
+ * start off at the maximum transaction size and grow the transaction
+ * optimistically as we go.
+ */
+#define OCFS2_MAX_TRANS_DATA 64U
/*
* Create access is for when we get a newly created buffer and we're
@@ -325,8 +336,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
* <modify the bh>
* ocfs2_journal_dirty(handle, bh);
*/
-int ocfs2_journal_dirty(handle_t *handle,
- struct buffer_head *bh);
+void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
/*
* Credit Macros:
@@ -405,9 +415,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
ocfs2_quota_trans_credits(sb);
}
-/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
- * bitmap block for the new bit) dx_root update for free list */
-#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
+/* data block for new dir/symlink, allocation of directory block, dx_root
+ * update for free list */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1)
static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
{
@@ -441,10 +451,11 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
- * update on dir + index leaf + dx root update for free list */
+ * update on dir + index leaf + dx root update for free list +
+ * previous dirblock update in the free list */
static inline int ocfs2_link_credits(struct super_block *sb)
{
- return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
+ return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
ocfs2_quota_trans_credits(sb);
}
@@ -513,8 +524,7 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
* the result may be wrong.
*/
static inline int ocfs2_calc_extend_credits(struct super_block *sb,
- struct ocfs2_extent_list *root_el,
- u32 bits_wanted)
+ struct ocfs2_extent_list *root_el)
{
int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
@@ -562,6 +572,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
return blocks;
}
+/*
+ * Allocating a discontiguous block group requires the credits from
+ * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
+ * the group descriptor's extent list. The caller already has started
+ * the transaction with ocfs2_calc_group_alloc_credits(). They extend
+ * it with these credits.
+ */
+static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
+{
+ return ocfs2_extent_recs_per_gd(sb);
+}
+
static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
unsigned int clusters_to_del,
struct ocfs2_dinode *fe,
@@ -604,4 +626,15 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
new_size);
}
+static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
+ struct inode *inode,
+ int datasync)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ oi->i_sync_tid = handle->h_transaction->t_tid;
+ if (datasync)
+ oi->i_datasync_tid = handle->h_transaction->t_tid;
+}
+
#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ca992d91f51..04401345562 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -29,7 +29,6 @@
#include <linux/highmem.h>
#include <linux/bitops.h>
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -43,6 +42,7 @@
#include "suballoc.h"
#include "super.h"
#include "sysfile.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc,
- u32 numbits);
+ u32 *numbits,
+ struct ocfs2_alloc_reservation *resv);
static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
@@ -74,6 +75,150 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct inode *local_alloc_inode);
+/*
+ * ocfs2_la_default_mb() - determine a default size, in megabytes of
+ * the local alloc.
+ *
+ * Generally, we'd like to pick as large a local alloc as
+ * possible. Performance on large workloads tends to scale
+ * proportionally to la size. In addition to that, the reservations
+ * code functions more efficiently as it can reserve more windows for
+ * write.
+ *
+ * Some things work against us when trying to choose a large local alloc:
+ *
+ * - We need to ensure our sizing is picked to leave enough space in
+ * group descriptors for other allocations (such as block groups,
+ * etc). Picking default sizes which are a multiple of 4 could help
+ * - block groups are allocated in 2mb and 4mb chunks.
+ *
+ * - Likewise, we don't want to starve other nodes of bits on small
+ * file systems. This can easily be taken care of by limiting our
+ * default to a reasonable size (256M) on larger cluster sizes.
+ *
+ * - Some file systems can't support very large sizes - 4k and 8k in
+ * particular are limited to less than 128 and 256 megabytes respectively.
+ *
+ * The following reference table shows group descriptor and local
+ * alloc maximums at various cluster sizes (4k blocksize)
+ *
+ * csize: 4K group: 126M la: 121M
+ * csize: 8K group: 252M la: 243M
+ * csize: 16K group: 504M la: 486M
+ * csize: 32K group: 1008M la: 972M
+ * csize: 64K group: 2016M la: 1944M
+ * csize: 128K group: 4032M la: 3888M
+ * csize: 256K group: 8064M la: 7776M
+ * csize: 512K group: 16128M la: 15552M
+ * csize: 1024K group: 32256M la: 31104M
+ */
+#define OCFS2_LA_MAX_DEFAULT_MB 256
+#define OCFS2_LA_OLD_DEFAULT 8
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
+{
+ unsigned int la_mb;
+ unsigned int gd_mb;
+ unsigned int la_max_mb;
+ unsigned int megs_per_slot;
+ struct super_block *sb = osb->sb;
+
+ gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
+ 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
+
+ /*
+ * This takes care of files systems with very small group
+ * descriptors - 512 byte blocksize at cluster sizes lower
+ * than 16K and also 1k blocksize with 4k cluster size.
+ */
+ if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
+ || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
+ return OCFS2_LA_OLD_DEFAULT;
+
+ /*
+ * Leave enough room for some block groups and make the final
+ * value we work from a multiple of 4.
+ */
+ gd_mb -= 16;
+ gd_mb &= 0xFFFFFFFB;
+
+ la_mb = gd_mb;
+
+ /*
+ * Keep window sizes down to a reasonable default
+ */
+ if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
+ /*
+ * Some clustersize / blocksize combinations will have
+ * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
+ * default size, but get poor distribution when
+ * limited to exactly 256 megabytes.
+ *
+ * As an example, 16K clustersize at 4K blocksize
+ * gives us a cluster group size of 504M. Paring the
+ * local alloc size down to 256 however, would give us
+ * only one window and around 200MB left in the
+ * cluster group. Instead, find the first size below
+ * 256 which would give us an even distribution.
+ *
+ * Larger cluster group sizes actually work out pretty
+ * well when pared to 256, so we don't have to do this
+ * for any group that fits more than two
+ * OCFS2_LA_MAX_DEFAULT_MB windows.
+ */
+ if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
+ la_mb = 256;
+ else {
+ unsigned int gd_mult = gd_mb;
+
+ while (gd_mult > 256)
+ gd_mult = gd_mult >> 1;
+
+ la_mb = gd_mult;
+ }
+ }
+
+ megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
+ megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
+ /* Too many nodes, too few disk clusters. */
+ if (megs_per_slot < la_mb)
+ la_mb = megs_per_slot;
+
+ /* We can't store more bits than we can in a block. */
+ la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
+ ocfs2_local_alloc_size(sb) * 8);
+ if (la_mb > la_max_mb)
+ la_mb = la_max_mb;
+
+ return la_mb;
+}
+
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
+{
+ struct super_block *sb = osb->sb;
+ unsigned int la_default_mb = ocfs2_la_default_mb(osb);
+ unsigned int la_max_mb;
+
+ la_max_mb = ocfs2_clusters_to_megabytes(sb,
+ ocfs2_local_alloc_size(sb) * 8);
+
+ trace_ocfs2_la_set_sizes(requested_mb, la_max_mb, la_default_mb);
+
+ if (requested_mb == -1) {
+ /* No user request - use defaults */
+ osb->local_alloc_default_bits =
+ ocfs2_megabytes_to_clusters(sb, la_default_mb);
+ } else if (requested_mb > la_max_mb) {
+ /* Request is too big, we give the maximum available */
+ osb->local_alloc_default_bits =
+ ocfs2_megabytes_to_clusters(sb, la_max_mb);
+ } else {
+ osb->local_alloc_default_bits =
+ ocfs2_megabytes_to_clusters(sb, requested_mb);
+ }
+
+ osb->local_alloc_bits = osb->local_alloc_default_bits;
+}
+
static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
{
return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -130,8 +275,8 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
ret = 1;
bail:
- mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
- osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
+ trace_ocfs2_alloc_should_use_local(
+ (unsigned long long)bits, osb->local_alloc_state, la_bits, ret);
spin_unlock(&osb->osb_lock);
return ret;
}
@@ -145,8 +290,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
struct inode *inode = NULL;
struct ocfs2_local_alloc *la;
- mlog_entry_void();
-
if (osb->local_alloc_bits == 0)
goto bail;
@@ -156,7 +299,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
osb->local_alloc_bits, (osb->bitmap_cpg - 1));
osb->local_alloc_bits =
ocfs2_megabytes_to_clusters(osb->sb,
- OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
+ ocfs2_la_default_mb(osb));
}
/* read the alloc off disk */
@@ -218,9 +361,10 @@ bail:
if (inode)
iput(inode);
- mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
+ trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -242,8 +386,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
struct ocfs2_dinode *alloc_copy = NULL;
struct ocfs2_dinode *alloc = NULL;
- mlog_entry_void();
-
cancel_delayed_work(&osb->la_enable_wq);
flush_workqueue(ocfs2_wq);
@@ -262,6 +404,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
osb->local_alloc_state = OCFS2_LA_DISABLED;
+ ocfs2_resmap_uninit(&osb->osb_la_resmap);
+
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -305,12 +449,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
}
ocfs2_clear_local_alloc(alloc);
-
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, bh);
brelse(bh);
osb->local_alloc_bh = NULL;
@@ -337,10 +476,7 @@ out:
if (local_alloc_inode)
iput(local_alloc_inode);
- if (alloc_copy)
- kfree(alloc_copy);
-
- mlog_exit_void();
+ kfree(alloc_copy);
}
/*
@@ -359,7 +495,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
struct inode *inode = NULL;
struct ocfs2_dinode *alloc;
- mlog_entry("(slot_num = %d)\n", slot_num);
+ trace_ocfs2_begin_local_alloc_recovery(slot_num);
*alloc_copy = NULL;
@@ -397,7 +533,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
mlog_errno(status);
bail:
- if ((status < 0) && (*alloc_copy)) {
+ if (status < 0) {
kfree(*alloc_copy);
*alloc_copy = NULL;
}
@@ -409,7 +545,8 @@ bail:
iput(inode);
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -427,8 +564,6 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
struct buffer_head *main_bm_bh = NULL;
struct inode *main_bm_inode;
- mlog_entry_void();
-
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -477,50 +612,11 @@ out_mutex:
out:
if (!status)
ocfs2_init_steal_slots(osb);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-/* Check to see if the local alloc window is within ac->ac_max_block */
-static int ocfs2_local_alloc_in_range(struct inode *inode,
- struct ocfs2_alloc_context *ac,
- u32 bits_wanted)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_dinode *alloc;
- struct ocfs2_local_alloc *la;
- int start;
- u64 block_off;
-
- if (!ac->ac_max_block)
- return 1;
-
- alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
- la = OCFS2_LOCAL_ALLOC(alloc);
-
- start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
- if (start == -1) {
- mlog_errno(-ENOSPC);
- return 0;
- }
-
- /*
- * Converting (bm_off + start + bits_wanted) to blocks gives us
- * the blkno just past our actual allocation. This is perfect
- * to compare with ac_max_block.
- */
- block_off = ocfs2_clusters_to_blocks(inode->i_sb,
- le32_to_cpu(la->la_bm_off) +
- start + bits_wanted);
- mlog(0, "Checking %llu against %llu\n",
- (unsigned long long)block_off,
- (unsigned long long)ac->ac_max_block);
- if (block_off > ac->ac_max_block)
- return 0;
-
- return 1;
-}
-
/*
* make sure we've got at least bits_wanted contiguous bits in the
* local alloc. You lose them when you drop i_mutex.
@@ -537,8 +633,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
struct inode *local_alloc_inode;
unsigned int free_bits;
- mlog_entry_void();
-
BUG_ON(!ac);
local_alloc_inode =
@@ -609,21 +703,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
goto bail;
}
- if (ac->ac_max_block)
- mlog(0, "Calling in_range for max block %llu\n",
- (unsigned long long)ac->ac_max_block);
-
- if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
- bits_wanted)) {
- /*
- * The window is outside ac->ac_max_block.
- * This errno tells the caller to keep localalloc enabled
- * but to get the allocation from the main bitmap.
- */
- status = -EFBIG;
- goto bail;
- }
-
ac->ac_inode = local_alloc_inode;
/* We should never use localalloc from another slot */
ac->ac_alloc_slot = osb->slot_num;
@@ -637,10 +716,12 @@ bail:
iput(local_alloc_inode);
}
- mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
- status);
+ trace_ocfs2_reserve_local_alloc_bits(
+ (unsigned long long)ac->ac_max_block,
+ bits_wanted, osb->slot_num, status);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -657,14 +738,14 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc;
struct ocfs2_local_alloc *la;
- mlog_entry_void();
BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
local_alloc_inode = ac->ac_inode;
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
la = OCFS2_LOCAL_ALLOC(alloc);
- start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+ start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
+ ac->ac_resv);
if (start == -1) {
/* TODO: Shouldn't we just BUG here? */
status = -ENOSPC;
@@ -674,8 +755,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
bitmap = la->la_bitmap;
*bit_off = le32_to_cpu(la->la_bm_off) + start;
- /* local alloc is always contiguous by nature -- we never
- * delete bits from it! */
*num_bits = bits_wanted;
status = ocfs2_journal_access_di(handle,
@@ -687,55 +766,114 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
goto bail;
}
+ ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
+ bits_wanted);
+
while(bits_wanted--)
ocfs2_set_bit(start++, bitmap);
le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
- status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+bail:
+ if (status)
+ mlog_errno(status);
+ return status;
+}
+
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 bit_off,
+ u32 num_bits)
+{
+ int status, start;
+ u32 clear_bits;
+ struct inode *local_alloc_inode;
+ void *bitmap;
+ struct ocfs2_dinode *alloc;
+ struct ocfs2_local_alloc *la;
+
+ BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+ local_alloc_inode = ac->ac_inode;
+ alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+ la = OCFS2_LOCAL_ALLOC(alloc);
+
+ bitmap = la->la_bitmap;
+ start = bit_off - le32_to_cpu(la->la_bm_off);
+ clear_bits = num_bits;
+
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(local_alloc_inode),
+ osb->local_alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = 0;
+ while (clear_bits--)
+ ocfs2_clear_bit(start++, bitmap);
+
+ le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits);
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
bail:
- mlog_exit(status);
return status;
}
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
{
- int i;
- u8 *buffer;
- u32 count = 0;
+ u32 count;
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
- mlog_entry_void();
-
- buffer = la->la_bitmap;
- for (i = 0; i < le16_to_cpu(la->la_size); i++)
- count += hweight8(buffer[i]);
+ count = memweight(la->la_bitmap, le16_to_cpu(la->la_size));
- mlog_exit(count);
+ trace_ocfs2_local_alloc_count_bits(count);
return count;
}
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
- struct ocfs2_dinode *alloc,
- u32 numbits)
+ struct ocfs2_dinode *alloc,
+ u32 *numbits,
+ struct ocfs2_alloc_reservation *resv)
{
int numfound, bitoff, left, startoff, lastzero;
+ int local_resv = 0;
+ struct ocfs2_alloc_reservation r;
void *bitmap = NULL;
-
- mlog_entry("(numbits wanted = %u)\n", numbits);
+ struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
if (!alloc->id1.bitmap1.i_total) {
- mlog(0, "No bits in my window!\n");
bitoff = -1;
goto bail;
}
+ if (!resv) {
+ local_resv = 1;
+ ocfs2_resv_init_once(&r);
+ ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
+ resv = &r;
+ }
+
+ numfound = *numbits;
+ if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
+ if (numfound < *numbits)
+ *numbits = numfound;
+ goto bail;
+ }
+
+ /*
+ * Code error. While reservations are enabled, local
+ * allocation should _always_ go through them.
+ */
+ BUG_ON(osb->osb_resv_level != 0);
+
+ /*
+ * Reservations are disabled. Handle this the old way.
+ */
+
bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
numfound = bitoff = startoff = 0;
@@ -761,22 +899,27 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
startoff = bitoff+1;
}
/* we got everything we needed */
- if (numfound == numbits) {
+ if (numfound == *numbits) {
/* mlog(0, "Found it all!\n"); */
break;
}
}
- mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
- numfound);
+ trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound);
- if (numfound == numbits)
+ if (numfound == *numbits)
bitoff = startoff - numfound;
else
bitoff = -1;
bail:
- mlog_exit(bitoff);
+ if (local_resv)
+ ocfs2_resv_discard(resmap, resv);
+
+ trace_ocfs2_local_alloc_find_clear_bits(*numbits,
+ le32_to_cpu(alloc->id1.bitmap1.i_total),
+ bitoff, numfound);
+
return bitoff;
}
@@ -784,15 +927,12 @@ static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
{
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
int i;
- mlog_entry_void();
alloc->id1.bitmap1.i_total = 0;
alloc->id1.bitmap1.i_used = 0;
la->la_bm_off = 0;
for(i = 0; i < le16_to_cpu(la->la_size); i++)
la->la_bitmap[i] = 0;
-
- mlog_exit_void();
}
#if 0
@@ -833,18 +973,16 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
void *bitmap;
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
- mlog_entry("total = %u, used = %u\n",
- le32_to_cpu(alloc->id1.bitmap1.i_total),
- le32_to_cpu(alloc->id1.bitmap1.i_used));
+ trace_ocfs2_sync_local_to_main(
+ le32_to_cpu(alloc->id1.bitmap1.i_total),
+ le32_to_cpu(alloc->id1.bitmap1.i_used));
if (!alloc->id1.bitmap1.i_total) {
- mlog(0, "nothing to sync!\n");
goto bail;
}
if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
le32_to_cpu(alloc->id1.bitmap1.i_total)) {
- mlog(0, "all bits were taken!\n");
goto bail;
}
@@ -866,14 +1004,15 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
ocfs2_clusters_to_blocks(osb->sb,
start - count);
- mlog(0, "freeing %u bits starting at local alloc bit "
- "%u (la_start_blk = %llu, blkno = %llu)\n",
+ trace_ocfs2_sync_local_to_main_free(
count, start - count,
(unsigned long long)la_start_blk,
(unsigned long long)blkno);
- status = ocfs2_free_clusters(handle, main_bm_inode,
- main_bm_bh, blkno, count);
+ status = ocfs2_release_clusters(handle,
+ main_bm_inode,
+ main_bm_bh, blkno,
+ count);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -886,7 +1025,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
}
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -985,7 +1125,6 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
retry_enospc:
(*ac)->ac_bits_wanted = osb->local_alloc_bits;
-
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
if (status == -ENOSPC) {
if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1012,7 +1151,8 @@ bail:
*ac = NULL;
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1028,17 +1168,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc = NULL;
struct ocfs2_local_alloc *la;
- mlog_entry_void();
-
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
la = OCFS2_LOCAL_ALLOC(alloc);
- if (alloc->id1.bitmap1.i_total)
- mlog(0, "asking me to alloc a new window over a non-empty "
- "one\n");
-
- mlog(0, "Allocating %u clusters for a new window.\n",
- osb->local_alloc_bits);
+ trace_ocfs2_local_alloc_new_window(
+ le32_to_cpu(alloc->id1.bitmap1.i_total),
+ osb->local_alloc_bits);
/* Instruct the allocation code to try the most recently used
* cluster group. We'll re-record the group used this pass
@@ -1048,7 +1183,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
/* we used the generic suballoc reserve function, but we set
* everything up nicely, so there's no reason why we can't use
* the more specific cluster api to claim bits. */
- status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
+ status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
&cluster_off, &cluster_count);
if (status == -ENOSPC) {
retry_enospc:
@@ -1061,7 +1196,8 @@ retry_enospc:
OCFS2_LA_DISABLED)
goto bail;
- status = ocfs2_claim_clusters(osb, handle, ac,
+ ac->ac_bits_wanted = osb->local_alloc_bits;
+ status = ocfs2_claim_clusters(handle, ac,
osb->local_alloc_bits,
&cluster_off,
&cluster_count);
@@ -1096,13 +1232,16 @@ retry_enospc:
memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
le16_to_cpu(la->la_size));
- mlog(0, "New window allocated:\n");
- mlog(0, "window la_bm_off = %u\n",
- OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
- mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
+ ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
+ OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
+
+ trace_ocfs2_local_alloc_new_window_result(
+ OCFS2_LOCAL_ALLOC(alloc)->la_bm_off,
+ le32_to_cpu(alloc->id1.bitmap1.i_total));
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1119,8 +1258,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct ocfs2_dinode *alloc_copy = NULL;
struct ocfs2_alloc_context *ac = NULL;
- mlog_entry_void();
-
ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
/* This will lock the main bitmap for us. */
@@ -1167,12 +1304,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
}
ocfs2_clear_local_alloc(alloc);
-
- status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
main_bm_inode, main_bm_bh);
@@ -1190,7 +1322,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
atomic_inc(&osb->alloc_stats.moves);
- status = 0;
bail:
if (handle)
ocfs2_commit_trans(osb, handle);
@@ -1200,13 +1331,13 @@ bail:
if (main_bm_inode)
iput(main_bm_inode);
- if (alloc_copy)
- kfree(alloc_copy);
+ kfree(alloc_copy);
if (ac)
ocfs2_free_alloc_context(ac);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index ac5ea9f8665..44a7d1fb2de 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
+
int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
int node_num,
struct ocfs2_dinode **alloc_copy);
@@ -52,6 +55,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
u32 *bit_off,
u32 *num_bits);
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 bit_off,
+ u32 num_bits);
+
void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
unsigned int num_clusters);
void ocfs2_la_enable_worker(struct work_struct *work);
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac624517..6b6d092b099 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -26,7 +26,6 @@
#include <linux/fs.h>
#include <linux/fcntl.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -83,6 +82,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
}
ret = flock_lock_file_wait(file, fl);
+ if (ret)
+ ocfs2_file_unlock(file);
out:
mutex_unlock(&fp->fp_mutex);
@@ -133,7 +134,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if (__mandatory_lock(inode))
+ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
return -ENOLCK;
return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 39737613424..10d66c75cec 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,14 +25,12 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/signal.h>
#include <linux/rbtree.h>
-#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -42,52 +40,29 @@
#include "file.h"
#include "inode.h"
#include "mmap.h"
+#include "super.h"
+#include "ocfs2_trace.h"
-static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
-{
- /* The best way to deal with signals in the vm path is
- * to block them upfront, rather than allowing the
- * locking paths to return -ERESTARTSYS. */
- sigfillset(blocked);
-
- /* We should technically never get a bad return value
- * from sigprocmask */
- return sigprocmask(SIG_BLOCK, blocked, oldset);
-}
-
-static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
-{
- return sigprocmask(SIG_SETMASK, oldset, NULL);
-}
static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
{
- sigset_t blocked, oldset;
- int error, ret;
-
- mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
-
- error = ocfs2_vm_op_block_sigs(&blocked, &oldset);
- if (error < 0) {
- mlog_errno(error);
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
+ sigset_t oldset;
+ int ret;
+ ocfs2_block_signals(&oldset);
ret = filemap_fault(area, vmf);
+ ocfs2_unblock_signals(&oldset);
- error = ocfs2_vm_op_unblock_sigs(&oldset);
- if (error < 0)
- mlog_errno(error);
-out:
- mlog_exit_ptr(vmf->page);
+ trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno,
+ area, vmf->page, vmf->pgoff);
return ret;
}
-static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
+static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
struct page *page)
{
- int ret;
+ int ret = VM_FAULT_NOPAGE;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
loff_t pos = page_offset(page);
unsigned int len = PAGE_CACHE_SIZE;
@@ -96,30 +71,25 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
void *fsdata;
loff_t size = i_size_read(inode);
- /*
- * Another node might have truncated while we were waiting on
- * cluster locks.
- */
- last_index = size >> PAGE_CACHE_SHIFT;
- if (page->index > last_index) {
- ret = -EINVAL;
- goto out;
- }
+ last_index = (size - 1) >> PAGE_CACHE_SHIFT;
/*
- * The i_size check above doesn't catch the case where nodes
- * truncated and then re-extended the file. We'll re-check the
- * page mapping after taking the page lock inside of
- * ocfs2_write_begin_nolock().
+ * There are cases that lead to the page no longer bebongs to the
+ * mapping.
+ * 1) pagecache truncates locally due to memory pressure.
+ * 2) pagecache truncates when another is taking EX lock against
+ * inode lock. see ocfs2_data_convert_worker.
+ *
+ * The i_size check doesn't catch the case where nodes truncated and
+ * then re-extended the file. We'll re-check the page mapping after
+ * taking the page lock inside of ocfs2_write_begin_nolock().
+ *
+ * Let VM retry with these cases.
*/
- if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
- /*
- * the page has been umapped in ocfs2_data_downconvert_worker.
- * So return 0 here and let VFS retry.
- */
- ret = 0;
+ if ((page->mapping != inode->i_mapping) ||
+ (!PageUptodate(page)) ||
+ (page_offset(page) >= size))
goto out;
- }
/*
* Call ocfs2_write_begin() and ocfs2_write_end() to take
@@ -132,24 +102,28 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
* because the "write" would invalidate their data.
*/
if (page->index == last_index)
- len = size & ~PAGE_CACHE_MASK;
+ len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
- ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
+ ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
&fsdata, di_bh, page);
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
- ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
- fsdata);
- if (ret < 0) {
- mlog_errno(ret);
+ if (!locked_page) {
+ ret = VM_FAULT_NOPAGE;
goto out;
}
+ ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
+ fsdata);
BUG_ON(ret != len);
- ret = 0;
+ ret = VM_FAULT_LOCKED;
out:
return ret;
}
@@ -157,16 +131,13 @@ out:
static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(vma->vm_file);
struct buffer_head *di_bh = NULL;
- sigset_t blocked, oldset;
- int ret, ret2;
+ sigset_t oldset;
+ int ret;
- ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
+ sb_start_pagefault(inode->i_sb);
+ ocfs2_block_signals(&oldset);
/*
* The cluster locks taken will block a truncate from another
@@ -186,7 +157,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = __ocfs2_page_mkwrite(inode, di_bh, page);
+ ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
@@ -194,33 +165,30 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ocfs2_inode_unlock(inode, 1);
out:
- ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
- if (ret2 < 0)
- mlog_errno(ret2);
- if (ret)
- ret = VM_FAULT_SIGBUS;
+ ocfs2_unblock_signals(&oldset);
+ sb_end_pagefault(inode->i_sb);
return ret;
}
static const struct vm_operations_struct ocfs2_file_vm_ops = {
.fault = ocfs2_fault,
.page_mkwrite = ocfs2_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
{
int ret = 0, lock_level = 0;
- ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
- file->f_vfsmnt, &lock_level);
+ ret = ocfs2_inode_lock_atime(file_inode(file),
+ file->f_path.mnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
+ ocfs2_inode_unlock(file_inode(file), lock_level);
out:
vma->vm_ops = &ocfs2_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
new file mode 100644
index 00000000000..599eb4c4c8b
--- /dev/null
+++ b/fs/ocfs2/move_extents.c
@@ -0,0 +1,1078 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.c
+ *
+ * Copyright (C) 2011 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/swap.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_ioctl.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "super.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "sysfile.h"
+#include "refcounttree.h"
+#include "move_extents.h"
+
+struct ocfs2_move_extents_context {
+ struct inode *inode;
+ struct file *file;
+ int auto_defrag;
+ int partial;
+ int credits;
+ u32 new_phys_cpos;
+ u32 clusters_moved;
+ u64 refcount_loc;
+ struct ocfs2_move_extents *range;
+ struct ocfs2_extent_tree et;
+ struct ocfs2_alloc_context *meta_ac;
+ struct ocfs2_alloc_context *data_ac;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+};
+
+static int __ocfs2_move_extent(handle_t *handle,
+ struct ocfs2_move_extents_context *context,
+ u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
+ int ext_flags)
+{
+ int ret = 0, index;
+ struct inode *inode = context->inode;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_extent_rec *rec, replace_rec;
+ struct ocfs2_path *path = NULL;
+ struct ocfs2_extent_list *el;
+ u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
+ u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
+
+ ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
+ p_cpos, new_p_cpos, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ memset(&replace_rec, 0, sizeof(replace_rec));
+ replace_rec.e_cpos = cpu_to_le32(cpos);
+ replace_rec.e_leaf_clusters = cpu_to_le16(len);
+ replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
+ new_p_cpos));
+
+ path = ocfs2_new_path_from_et(&context->et);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has an extent at cpos %u which can no "
+ "longer be found.\n",
+ (unsigned long long)ino, cpos);
+ ret = -EROFS;
+ goto out;
+ }
+
+ rec = &el->l_recs[index];
+
+ BUG_ON(ext_flags != rec->e_flags);
+ /*
+ * after moving/defraging to new location, the extent is not going
+ * to be refcounted anymore.
+ */
+ replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ context->et.et_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_split_extent(handle, &context->et, path, index,
+ &replace_rec, context->meta_ac,
+ &context->dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_journal_dirty(handle, context->et.et_root_bh);
+
+ context->new_phys_cpos = new_p_cpos;
+
+ /*
+ * need I to append truncate log for old clusters?
+ */
+ if (old_blkno) {
+ if (ext_flags & OCFS2_EXT_REFCOUNTED)
+ ret = ocfs2_decrease_refcount(inode, handle,
+ ocfs2_blocks_to_clusters(osb->sb,
+ old_blkno),
+ len, context->meta_ac,
+ &context->dealloc, 1);
+ else
+ ret = ocfs2_truncate_log_append(osb, handle,
+ old_blkno, len);
+ }
+
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
+out:
+ ocfs2_free_path(path);
+ return ret;
+}
+
+/*
+ * lock allocators, and reserving appropriate number of bits for
+ * meta blocks and data clusters.
+ *
+ * in some cases, we don't need to reserve clusters, just let data_ac
+ * be NULL.
+ */
+static int ocfs2_lock_allocators_move_extents(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters_to_move,
+ u32 extents_to_split,
+ struct ocfs2_alloc_context **meta_ac,
+ struct ocfs2_alloc_context **data_ac,
+ int extra_blocks,
+ int *credits)
+{
+ int ret, num_free_extents;
+ unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ num_free_extents = ocfs2_num_free_extents(osb, et);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (!num_free_extents ||
+ (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
+ extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+ ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (data_ac) {
+ ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
+
+ mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
+ extra_blocks, clusters_to_move, *credits);
+out:
+ if (ret) {
+ if (*meta_ac) {
+ ocfs2_free_alloc_context(*meta_ac);
+ *meta_ac = NULL;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Using one journal handle to guarantee the data consistency in case
+ * crash happens anywhere.
+ *
+ * XXX: defrag can end up with finishing partial extent as requested,
+ * due to not enough contiguous clusters can be found in allocator.
+ */
+static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
+ u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
+{
+ int ret, credits = 0, extra_blocks = 0, partial = context->partial;
+ handle_t *handle;
+ struct inode *inode = context->inode;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+ u32 new_phys_cpos, new_len;
+ u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+ if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
+
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+ OCFS2_HAS_REFCOUNT_FL));
+
+ BUG_ON(!context->refcount_loc);
+
+ ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+ &ref_tree, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_prepare_refcount_change_for_del(inode,
+ context->refcount_loc,
+ phys_blkno,
+ *len,
+ &credits,
+ &extra_blocks);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
+ &context->meta_ac,
+ &context->data_ac,
+ extra_blocks, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * should be using allocation reservation strategy there?
+ *
+ * if (context->data_ac)
+ * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+ */
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock_mutex;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock_mutex;
+ }
+
+ ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
+ &new_phys_cpos, &new_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /*
+ * allowing partial extent moving is kind of 'pros and cons', it makes
+ * whole defragmentation less likely to fail, on the contrary, the bad
+ * thing is it may make the fs even more fragmented after moving, let
+ * userspace make a good decision here.
+ */
+ if (new_len != *len) {
+ mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
+ if (!partial) {
+ context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
+ ret = -ENOSPC;
+ goto out_commit;
+ }
+ }
+
+ mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
+ phys_cpos, new_phys_cpos);
+
+ ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
+ new_phys_cpos, ext_flags);
+ if (ret)
+ mlog_errno(ret);
+
+ if (partial && (new_len != *len))
+ *len = new_len;
+
+ /*
+ * Here we should write the new page out first if we are
+ * in write-back mode.
+ */
+ ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out_unlock_mutex:
+ mutex_unlock(&tl_inode->i_mutex);
+
+ if (context->data_ac) {
+ ocfs2_free_alloc_context(context->data_ac);
+ context->data_ac = NULL;
+ }
+
+ if (context->meta_ac) {
+ ocfs2_free_alloc_context(context->meta_ac);
+ context->meta_ac = NULL;
+ }
+
+out:
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+ return ret;
+}
+
+/*
+ * find the victim alloc group, where #blkno fits.
+ */
+static int ocfs2_find_victim_alloc_group(struct inode *inode,
+ u64 vict_blkno,
+ int type, int slot,
+ int *vict_bit,
+ struct buffer_head **ret_bh)
+{
+ int ret, i, bits_per_unit = 0;
+ u64 blkno;
+ char namebuf[40];
+
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
+ struct ocfs2_chain_list *cl;
+ struct ocfs2_chain_rec *rec;
+ struct ocfs2_dinode *ac_dinode;
+ struct ocfs2_group_desc *bg;
+
+ ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
+ ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
+ strlen(namebuf), &blkno);
+ if (ret) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
+ cl = &(ac_dinode->id2.i_chain);
+ rec = &(cl->cl_recs[0]);
+
+ if (type == GLOBAL_BITMAP_SYSTEM_INODE)
+ bits_per_unit = osb->s_clustersize_bits -
+ inode->i_sb->s_blocksize_bits;
+ /*
+ * 'vict_blkno' was out of the valid range.
+ */
+ if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
+ (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
+ bits_per_unit))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+
+ rec = &(cl->cl_recs[i]);
+ if (!rec)
+ continue;
+
+ bg = NULL;
+
+ do {
+ if (!bg)
+ blkno = le64_to_cpu(rec->c_blkno);
+ else
+ blkno = le64_to_cpu(bg->bg_next_group);
+
+ if (gd_bh) {
+ brelse(gd_bh);
+ gd_bh = NULL;
+ }
+
+ ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+ if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
+ le16_to_cpu(bg->bg_bits))) {
+
+ *ret_bh = gd_bh;
+ *vict_bit = (vict_blkno - blkno) >>
+ bits_per_unit;
+ mlog(0, "find the victim group: #%llu, "
+ "total_bits: %u, vict_bit: %u\n",
+ blkno, le16_to_cpu(bg->bg_bits),
+ *vict_bit);
+ goto out;
+ }
+
+ } while (le64_to_cpu(bg->bg_next_group));
+ }
+
+ ret = -EINVAL;
+out:
+ brelse(ac_bh);
+
+ /*
+ * caller has to release the gd_bh properly.
+ */
+ return ret;
+}
+
+/*
+ * XXX: helper to validate and adjust moving goal.
+ */
+static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
+ struct ocfs2_move_extents *range)
+{
+ int ret, goal_bit = 0;
+
+ struct buffer_head *gd_bh = NULL;
+ struct ocfs2_group_desc *bg;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int c_to_b = 1 << (osb->s_clustersize_bits -
+ inode->i_sb->s_blocksize_bits);
+
+ /*
+ * make goal become cluster aligned.
+ */
+ range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
+ range->me_goal);
+ /*
+ * validate goal sits within global_bitmap, and return the victim
+ * group desc
+ */
+ ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT,
+ &goal_bit, &gd_bh);
+ if (ret)
+ goto out;
+
+ bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+ /*
+ * moving goal is not allowd to start with a group desc blok(#0 blk)
+ * let's compromise to the latter cluster.
+ */
+ if (range->me_goal == le64_to_cpu(bg->bg_blkno))
+ range->me_goal += c_to_b;
+
+ /*
+ * movement is not gonna cross two groups.
+ */
+ if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
+ range->me_len) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /*
+ * more exact validations/adjustments will be performed later during
+ * moving operation for each extent range.
+ */
+ mlog(0, "extents get ready to be moved to #%llu block\n",
+ range->me_goal);
+
+out:
+ brelse(gd_bh);
+
+ return ret;
+}
+
+static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
+ int *goal_bit, u32 move_len, u32 max_hop,
+ u32 *phys_cpos)
+{
+ int i, used, last_free_bits = 0, base_bit = *goal_bit;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+ u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+ le64_to_cpu(gd->bg_blkno));
+
+ for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
+
+ used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
+ if (used) {
+ /*
+ * we even tried searching the free chunk by jumping
+ * a 'max_hop' distance, but still failed.
+ */
+ if ((i - base_bit) > max_hop) {
+ *phys_cpos = 0;
+ break;
+ }
+
+ if (last_free_bits)
+ last_free_bits = 0;
+
+ continue;
+ } else
+ last_free_bits++;
+
+ if (last_free_bits == move_len) {
+ *goal_bit = i;
+ *phys_cpos = base_cpos + i;
+ break;
+ }
+ }
+
+ mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
+}
+
+static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
+ u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
+ u32 len, int ext_flags)
+{
+ int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
+ handle_t *handle;
+ struct inode *inode = context->inode;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct inode *gb_inode = NULL;
+ struct buffer_head *gb_bh = NULL;
+ struct buffer_head *gd_bh = NULL;
+ struct ocfs2_group_desc *gd;
+ struct ocfs2_refcount_tree *ref_tree = NULL;
+ u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
+ context->range->me_threshold);
+ u64 phys_blkno, new_phys_blkno;
+
+ phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+ if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
+
+ BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+ OCFS2_HAS_REFCOUNT_FL));
+
+ BUG_ON(!context->refcount_loc);
+
+ ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+ &ref_tree, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_prepare_refcount_change_for_del(inode,
+ context->refcount_loc,
+ phys_blkno,
+ len,
+ &credits,
+ &extra_blocks);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
+ &context->meta_ac,
+ NULL, extra_blocks, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * need to count 2 extra credits for global_bitmap inode and
+ * group descriptor.
+ */
+ credits += OCFS2_INODE_UPDATE_CREDITS + 1;
+
+ /*
+ * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
+ * logic, while we still need to lock the global_bitmap.
+ */
+ gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!gb_inode) {
+ mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+ ret = -EIO;
+ goto out;
+ }
+
+ mutex_lock(&gb_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_gb_mutex;
+ }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock_tl_inode;
+ }
+
+ new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
+ ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT,
+ &goal_bit, &gd_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /*
+ * probe the victim cluster group to find a proper
+ * region to fit wanted movement, it even will perfrom
+ * a best-effort attempt by compromising to a threshold
+ * around the goal.
+ */
+ ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
+ new_phys_cpos);
+ if (!*new_phys_cpos) {
+ ret = -ENOSPC;
+ goto out_commit;
+ }
+
+ ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
+ *new_phys_cpos, ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+ ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
+ le16_to_cpu(gd->bg_chain));
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
+ goal_bit, len);
+ if (ret) {
+ ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
+ le16_to_cpu(gd->bg_chain));
+ mlog_errno(ret);
+ }
+
+ /*
+ * Here we should write the new page out first if we are
+ * in write-back mode.
+ */
+ ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+ brelse(gd_bh);
+
+out_unlock_tl_inode:
+ mutex_unlock(&tl_inode->i_mutex);
+
+ ocfs2_inode_unlock(gb_inode, 1);
+out_unlock_gb_mutex:
+ mutex_unlock(&gb_inode->i_mutex);
+ brelse(gb_bh);
+ iput(gb_inode);
+
+out:
+ if (context->meta_ac) {
+ ocfs2_free_alloc_context(context->meta_ac);
+ context->meta_ac = NULL;
+ }
+
+ if (ref_tree)
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+ return ret;
+}
+
+/*
+ * Helper to calculate the defraging length in one run according to threshold.
+ */
+static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
+ u32 threshold, int *skip)
+{
+ if ((*alloc_size + *len_defraged) < threshold) {
+ /*
+ * proceed defragmentation until we meet the thresh
+ */
+ *len_defraged += *alloc_size;
+ } else if (*len_defraged == 0) {
+ /*
+ * XXX: skip a large extent.
+ */
+ *skip = 1;
+ } else {
+ /*
+ * split this extent to coalesce with former pieces as
+ * to reach the threshold.
+ *
+ * we're done here with one cycle of defragmentation
+ * in a size of 'thresh', resetting 'len_defraged'
+ * forces a new defragmentation.
+ */
+ *alloc_size = threshold - *len_defraged;
+ *len_defraged = 0;
+ }
+}
+
+static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
+ struct ocfs2_move_extents_context *context)
+{
+ int ret = 0, flags, do_defrag, skip = 0;
+ u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
+ u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
+
+ struct inode *inode = context->inode;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_move_extents *range = context->range;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if ((i_size_read(inode) == 0) || (range->me_len == 0))
+ return 0;
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
+
+ context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
+
+ ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
+ ocfs2_init_dealloc_ctxt(&context->dealloc);
+
+ /*
+ * TO-DO XXX:
+ *
+ * - xattr extents.
+ */
+
+ do_defrag = context->auto_defrag;
+
+ /*
+ * extents moving happens in unit of clusters, for the sake
+ * of simplicity, we may ignore two clusters where 'byte_start'
+ * and 'byte_start + len' were within.
+ */
+ move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
+ len_to_move = (range->me_start + range->me_len) >>
+ osb->s_clustersize_bits;
+ if (len_to_move >= move_start)
+ len_to_move -= move_start;
+ else
+ len_to_move = 0;
+
+ if (do_defrag) {
+ defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
+ if (defrag_thresh <= 1)
+ goto done;
+ } else
+ new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+ range->me_goal);
+
+ mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
+ "thresh: %u\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)range->me_start,
+ (unsigned long long)range->me_len,
+ move_start, len_to_move, defrag_thresh);
+
+ cpos = move_start;
+ while (len_to_move) {
+ ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
+ &flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (alloc_size > len_to_move)
+ alloc_size = len_to_move;
+
+ /*
+ * XXX: how to deal with a hole:
+ *
+ * - skip the hole of course
+ * - force a new defragmentation
+ */
+ if (!phys_cpos) {
+ if (do_defrag)
+ len_defraged = 0;
+
+ goto next;
+ }
+
+ if (do_defrag) {
+ ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
+ defrag_thresh, &skip);
+ /*
+ * skip large extents
+ */
+ if (skip) {
+ skip = 0;
+ goto next;
+ }
+
+ mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
+ "alloc_size: %u, len_defraged: %u\n",
+ cpos, phys_cpos, alloc_size, len_defraged);
+
+ ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
+ &alloc_size, flags);
+ } else {
+ ret = ocfs2_move_extent(context, cpos, phys_cpos,
+ &new_phys_cpos, alloc_size,
+ flags);
+
+ new_phys_cpos += alloc_size;
+ }
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ context->clusters_moved += alloc_size;
+next:
+ cpos += alloc_size;
+ len_to_move -= alloc_size;
+ }
+
+done:
+ range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
+
+out:
+ range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
+ context->clusters_moved);
+ range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
+ context->new_phys_cpos);
+
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &context->dealloc);
+
+ return ret;
+}
+
+static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
+{
+ int status;
+ handle_t *handle;
+ struct inode *inode = context->inode;
+ struct ocfs2_dinode *di;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (!inode)
+ return -ENOENT;
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * This prevents concurrent writes from other nodes
+ */
+ status = ocfs2_rw_lock(inode, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ status = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out_rw_unlock;
+ }
+
+ /*
+ * rememer ip_xattr_sem also needs to be held if necessary
+ */
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+ status = __ocfs2_move_extents_range(di_bh, context);
+
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ if (status) {
+ mlog_errno(status);
+ goto out_inode_unlock;
+ }
+
+ /*
+ * We update ctime for these changes
+ */
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_inode_unlock;
+ }
+
+ status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ inode->i_ctime = CURRENT_TIME;
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out_inode_unlock:
+ brelse(di_bh);
+ ocfs2_inode_unlock(inode, 1);
+out_rw_unlock:
+ ocfs2_rw_unlock(inode, 1);
+out:
+ mutex_unlock(&inode->i_mutex);
+
+ return status;
+}
+
+int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
+{
+ int status;
+
+ struct inode *inode = file_inode(filp);
+ struct ocfs2_move_extents range;
+ struct ocfs2_move_extents_context *context;
+
+ if (!argp)
+ return -EINVAL;
+
+ status = mnt_want_write_file(filp);
+ if (status)
+ return status;
+
+ if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
+ status = -EPERM;
+ goto out_drop;
+ }
+
+ if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+ status = -EPERM;
+ goto out_drop;
+ }
+
+ context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
+ if (!context) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_drop;
+ }
+
+ context->inode = inode;
+ context->file = filp;
+
+ if (copy_from_user(&range, argp, sizeof(range))) {
+ status = -EFAULT;
+ goto out_free;
+ }
+
+ if (range.me_start > i_size_read(inode)) {
+ status = -EINVAL;
+ goto out_free;
+ }
+
+ if (range.me_start + range.me_len > i_size_read(inode))
+ range.me_len = i_size_read(inode) - range.me_start;
+
+ context->range = &range;
+
+ if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
+ context->auto_defrag = 1;
+ /*
+ * ok, the default theshold for the defragmentation
+ * is 1M, since our maximum clustersize was 1M also.
+ * any thought?
+ */
+ if (!range.me_threshold)
+ range.me_threshold = 1024 * 1024;
+
+ if (range.me_threshold > i_size_read(inode))
+ range.me_threshold = i_size_read(inode);
+
+ if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
+ context->partial = 1;
+ } else {
+ /*
+ * first best-effort attempt to validate and adjust the goal
+ * (physical address in block), while it can't guarantee later
+ * operation can succeed all the time since global_bitmap may
+ * change a bit over time.
+ */
+
+ status = ocfs2_validate_and_adjust_move_goal(inode, &range);
+ if (status)
+ goto out_copy;
+ }
+
+ status = ocfs2_move_extents(context);
+ if (status)
+ mlog_errno(status);
+out_copy:
+ /*
+ * movement/defragmentation may end up being partially completed,
+ * that's the reason why we need to return userspace the finished
+ * length and new_offset even if failure happens somewhere.
+ */
+ if (copy_to_user(argp, &range, sizeof(range)))
+ status = -EFAULT;
+
+out_free:
+ kfree(context);
+out_drop:
+ mnt_drop_write_file(filp);
+
+ return status;
+}
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
new file mode 100644
index 00000000000..4e143e81144
--- /dev/null
+++ b/fs/ocfs2/move_extents.h
@@ -0,0 +1,22 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.h
+ *
+ * Copyright (C) 2011 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_MOVE_EXTENTS_H
+#define OCFS2_MOVE_EXTENTS_H
+
+int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
+
+#endif /* OCFS2_MOVE_EXTENTS_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d9cd4e373a5..8add6f1030d 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -42,7 +42,6 @@
#include <linux/highmem.h>
#include <linux/quotaops.h>
-#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -63,6 +62,7 @@
#include "uptodate.h"
#include "xattr.h"
#include "acl.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
static int ocfs2_orphan_add(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct ocfs2_dinode *fe,
+ struct buffer_head *fe_bh,
char *name,
struct ocfs2_dir_lookup_result *lookup,
struct inode *orphan_dir_inode);
@@ -98,7 +98,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
+ unsigned int flags)
{
int status;
u64 blkno;
@@ -106,17 +106,15 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
struct dentry *ret;
struct ocfs2_inode_info *oi;
- mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_lookup(dir, dentry, dentry->d_name.len,
+ dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, 0);
if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
ret = ERR_PTR(-ENAMETOOLONG);
goto bail;
}
- mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
- dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
-
status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT);
if (status < 0) {
if (status != -ENOENT)
@@ -147,7 +145,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
spin_unlock(&oi->ip_lock);
bail_add:
- dentry->d_op = &ocfs2_dentry_ops;
ret = d_splice_alias(inode, dentry);
if (inode) {
@@ -171,7 +168,8 @@ bail_add:
ret = ERR_PTR(status);
goto bail_unlock;
}
- }
+ } else
+ ocfs2_dentry_attach_gen(dentry);
bail_unlock:
/* Don't drop the cluster lock until *after* the d_add --
@@ -182,12 +180,12 @@ bail_unlock:
bail:
- mlog_exit_ptr(ret);
+ trace_ocfs2_lookup_ret(ret);
return ret;
}
-static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
+static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
{
struct inode *inode;
@@ -201,24 +199,30 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
* these are used by the support functions here and in
* callers. */
if (S_ISDIR(mode))
- inode->i_nlink = 2;
- else
- inode->i_nlink = 1;
- inode->i_uid = current_fsuid();
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- inode->i_gid = current_fsgid();
- inode->i_mode = mode;
+ set_nlink(inode, 2);
+ inode_init_owner(inode, dir, mode);
dquot_initialize(inode);
return inode;
}
+static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb,
+ struct dentry *dentry, struct inode *inode)
+{
+ struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
+
+ ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
+ ocfs2_lock_res_free(&dl->dl_lockres);
+ BUG_ON(dl->dl_count != 1);
+ spin_lock(&dentry_attach_lock);
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry_attach_lock);
+ kfree(dl);
+ iput(inode);
+}
+
static int ocfs2_mknod(struct inode *dir,
struct dentry *dentry,
- int mode,
+ umode_t mode,
dev_t dev)
{
int status = 0;
@@ -239,10 +243,14 @@ static int ocfs2_mknod(struct inode *dir,
};
int did_quota_inode = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
+ sigset_t oldset;
+ int did_block_signals = 0;
+ struct posix_acl *default_acl = NULL, *acl = NULL;
+ struct ocfs2_dentry_lock *dl = NULL;
- mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
- (unsigned long)dev, dentry->d_name.len,
- dentry->d_name.name);
+ trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long)dev, mode);
dquot_initialize(dir);
@@ -298,7 +306,7 @@ static int ocfs2_mknod(struct inode *dir,
}
/* get security xattr */
- status = ocfs2_init_security_get(inode, dir, &si);
+ status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
if (status) {
if (status == -EOPNOTSUPP)
si.enable = 0;
@@ -340,6 +348,12 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
+ status = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (status) {
+ mlog_errno(status);
+ goto leave;
+ }
+
handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
S_ISDIR(mode),
xattr_credits));
@@ -350,15 +364,15 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
+ /* Starting to change things, restart is no longer possible. */
+ ocfs2_block_signals(&oldset);
+ did_block_signals = 1;
+
status = dquot_alloc_inode(inode);
if (status)
goto leave;
did_quota_inode = 1;
- mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
- inode->i_mode, (unsigned long)dev, dentry->d_name.len,
- dentry->d_name.name);
-
/* do the real work now. */
status = ocfs2_mknod_locked(osb, dir, inode, dev,
&new_fe_bh, parent_fe_bh, handle,
@@ -384,16 +398,21 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
ocfs2_add_links_count(dirfe, 1);
- status = ocfs2_journal_dirty(handle, parent_fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, parent_fe_bh);
inc_nlink(dir);
}
- status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
- meta_ac, data_ac);
+ if (default_acl) {
+ status = ocfs2_set_acl(handle, inode, new_fe_bh,
+ ACL_TYPE_DEFAULT, default_acl,
+ meta_ac, data_ac);
+ }
+ if (!status && acl) {
+ status = ocfs2_set_acl(handle, inode, new_fe_bh,
+ ACL_TYPE_ACCESS, acl,
+ meta_ac, data_ac);
+ }
+
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -408,48 +427,51 @@ static int ocfs2_mknod(struct inode *dir,
}
}
- status = ocfs2_add_entry(handle, dentry, inode,
- OCFS2_I(inode)->ip_blkno, parent_fe_bh,
- &lookup);
- if (status < 0) {
+ /*
+ * Do this before adding the entry to the directory. We add
+ * also set d_op after success so that ->d_iput() will cleanup
+ * the dentry lock even if ocfs2_add_entry() fails below.
+ */
+ status = ocfs2_dentry_attach_lock(dentry, inode,
+ OCFS2_I(dir)->ip_blkno);
+ if (status) {
mlog_errno(status);
goto leave;
}
- status = ocfs2_dentry_attach_lock(dentry, inode,
- OCFS2_I(dir)->ip_blkno);
- if (status) {
+ dl = dentry->d_fsdata;
+
+ status = ocfs2_add_entry(handle, dentry, inode,
+ OCFS2_I(inode)->ip_blkno, parent_fe_bh,
+ &lookup);
+ if (status < 0) {
mlog_errno(status);
goto leave;
}
insert_inode_hash(inode);
- dentry->d_op = &ocfs2_dentry_ops;
d_instantiate(dentry, inode);
status = 0;
leave:
+ if (default_acl)
+ posix_acl_release(default_acl);
+ if (acl)
+ posix_acl_release(acl);
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
if (handle)
ocfs2_commit_trans(osb, handle);
ocfs2_inode_unlock(dir, 1);
-
- if (status == -ENOSPC)
- mlog(0, "Disk is full\n");
+ if (did_block_signals)
+ ocfs2_unblock_signals(&oldset);
brelse(new_fe_bh);
brelse(parent_fe_bh);
- kfree(si.name);
kfree(si.value);
ocfs2_free_dir_lookup_result(&lookup);
- if ((status < 0) && inode) {
- clear_nlink(inode);
- iput(inode);
- }
-
if (inode_ac)
ocfs2_free_alloc_context(inode_ac);
@@ -459,36 +481,44 @@ leave:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
- mlog_exit(status);
+ /*
+ * We should call iput after the i_mutex of the bitmap been
+ * unlocked in ocfs2_free_alloc_context, or the
+ * ocfs2_delete_inode will mutex_lock again.
+ */
+ if ((status < 0) && inode) {
+ if (dl)
+ ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
+ OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
+ clear_nlink(inode);
+ iput(inode);
+ }
+
+ if (status)
+ mlog_errno(status);
return status;
}
-static int ocfs2_mknod_locked(struct ocfs2_super *osb,
- struct inode *dir,
- struct inode *inode,
- dev_t dev,
- struct buffer_head **new_fe_bh,
- struct buffer_head *parent_fe_bh,
- handle_t *handle,
- struct ocfs2_alloc_context *inode_ac)
+static int __ocfs2_mknod_locked(struct inode *dir,
+ struct inode *inode,
+ dev_t dev,
+ struct buffer_head **new_fe_bh,
+ struct buffer_head *parent_fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *inode_ac,
+ u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit)
{
int status = 0;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dinode *fe = NULL;
struct ocfs2_extent_list *fel;
- u64 fe_blkno = 0;
- u16 suballoc_bit;
u16 feat;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
*new_fe_bh = NULL;
- status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
- inode_ac, &suballoc_bit, &fe_blkno);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
/* populate as many fields early on as possible - many of
* these are used by the support functions here and in
* callers. */
@@ -500,7 +530,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
if (!*new_fe_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto leave;
}
@@ -520,10 +550,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
fe->i_generation = cpu_to_le32(inode->i_generation);
fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
fe->i_blkno = cpu_to_le64(fe_blkno);
+ fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
- fe->i_uid = cpu_to_le32(inode->i_uid);
- fe->i_gid = cpu_to_le32(inode->i_gid);
+ fe->i_uid = cpu_to_le32(i_uid_read(inode));
+ fe->i_gid = cpu_to_le32(i_gid_read(inode));
fe->i_mode = cpu_to_le16(inode->i_mode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
@@ -532,7 +563,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
fe->i_last_eb_blk = 0;
strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
- le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
+ fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
fe->i_atime = fe->i_ctime = fe->i_mtime =
cpu_to_le64(CURRENT_TIME.tv_sec);
fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
@@ -556,11 +587,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
}
- status = ocfs2_journal_dirty(handle, *new_fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, *new_fe_bh);
ocfs2_populate_inode(inode, fe, 1);
ocfs2_ci_set_new(osb, INODE_CACHE(inode));
@@ -570,8 +597,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
mlog_errno(status);
}
- status = 0; /* error in ocfs2_create_new_inode_locks is not
- * critical */
+ oi->i_sync_tid = handle->h_transaction->t_tid;
+ oi->i_datasync_tid = handle->h_transaction->t_tid;
leave:
if (status < 0) {
@@ -581,35 +608,66 @@ leave:
}
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+ struct inode *dir,
+ struct inode *inode,
+ dev_t dev,
+ struct buffer_head **new_fe_bh,
+ struct buffer_head *parent_fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *inode_ac)
+{
+ int status = 0;
+ u64 suballoc_loc, fe_blkno = 0;
+ u16 suballoc_bit;
+
+ *new_fe_bh = NULL;
+
+ status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
+ inode_ac, &suballoc_loc,
+ &suballoc_bit, &fe_blkno);
+ if (status < 0) {
+ mlog_errno(status);
+ return status;
+ }
+
+ return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+ parent_fe_bh, handle, inode_ac,
+ fe_blkno, suballoc_loc, suballoc_bit);
+}
+
static int ocfs2_mkdir(struct inode *dir,
struct dentry *dentry,
- int mode)
+ umode_t mode)
{
int ret;
- mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+ OCFS2_I(dir)->ip_blkno, mode);
ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
}
static int ocfs2_create(struct inode *dir,
struct dentry *dentry,
- int mode,
- struct nameidata *nd)
+ umode_t mode,
+ bool excl)
{
int ret;
- mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno, mode);
ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
}
@@ -626,10 +684,12 @@ static int ocfs2_link(struct dentry *old_dentry,
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dir_lookup_result lookup = { NULL, };
+ sigset_t oldset;
+ u64 old_de_ino;
- mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
- old_dentry->d_name.len, old_dentry->d_name.name,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ old_dentry->d_name.len, old_dentry->d_name.name,
+ dentry->d_name.len, dentry->d_name.name);
if (S_ISDIR(inode->i_mode))
return -EPERM;
@@ -648,6 +708,22 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out;
}
+ err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
+ old_dentry->d_name.len, &old_de_ino);
+ if (err) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ /*
+ * Check whether another node removed the source inode while we
+ * were in the vfs.
+ */
+ if (old_de_ino != OCFS2_I(inode)->ip_blkno) {
+ err = -ENOENT;
+ goto out;
+ }
+
err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
dentry->d_name.len);
if (err)
@@ -682,6 +758,9 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out_unlock_inode;
}
+ /* Starting to change things, restart is no longer possible. */
+ ocfs2_block_signals(&oldset);
+
err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (err < 0) {
@@ -694,14 +773,7 @@ static int ocfs2_link(struct dentry *old_dentry,
ocfs2_set_links_count(fe, inode->i_nlink);
fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
-
- err = ocfs2_journal_dirty(handle, fe_bh);
- if (err < 0) {
- ocfs2_add_links_count(fe, -1);
- drop_nlink(inode);
- mlog_errno(err);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, fe_bh);
err = ocfs2_add_entry(handle, dentry, inode,
OCFS2_I(inode)->ip_blkno,
@@ -719,12 +791,12 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out_commit;
}
- atomic_inc(&inode->i_count);
- dentry->d_op = &ocfs2_dentry_ops;
+ ihold(inode);
d_instantiate(dentry, inode);
out_commit:
ocfs2_commit_trans(osb, handle);
+ ocfs2_unblock_signals(&oldset);
out_unlock_inode:
ocfs2_inode_unlock(inode, 1);
@@ -736,7 +808,8 @@ out:
ocfs2_free_dir_lookup_result(&lookup);
- mlog_exit(err);
+ if (err)
+ mlog_errno(err);
return err;
}
@@ -758,7 +831,7 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry)
return ret;
}
-static inline int inode_is_unlinkable(struct inode *inode)
+static inline int ocfs2_inode_is_unlinkable(struct inode *inode)
{
if (S_ISDIR(inode->i_mode)) {
if (inode->i_nlink == 2)
@@ -776,6 +849,7 @@ static int ocfs2_unlink(struct inode *dir,
{
int status;
int child_locked = 0;
+ bool is_unlinkable = false;
struct inode *inode = dentry->d_inode;
struct inode *orphan_dir = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -788,19 +862,17 @@ static int ocfs2_unlink(struct inode *dir,
struct ocfs2_dir_lookup_result lookup = { NULL, };
struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
- mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_unlink(dir, dentry, dentry->d_name.len,
+ dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
dquot_initialize(dir);
BUG_ON(dentry->d_parent->d_inode != dir);
- mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
-
- if (inode == osb->root_inode) {
- mlog(0, "Cannot delete the root directory\n");
+ if (inode == osb->root_inode)
return -EPERM;
- }
status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1,
OI_LS_PARENT);
@@ -822,9 +894,10 @@ static int ocfs2_unlink(struct inode *dir,
if (OCFS2_I(inode)->ip_blkno != blkno) {
status = -ENOENT;
- mlog(0, "ip_blkno %llu != dirent blkno %llu ip_flags = %x\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)blkno, OCFS2_I(inode)->ip_flags);
+ trace_ocfs2_unlink_noent(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)blkno,
+ OCFS2_I(inode)->ip_flags);
goto leave;
}
@@ -851,7 +924,7 @@ static int ocfs2_unlink(struct inode *dir,
goto leave;
}
- if (inode_is_unlinkable(inode)) {
+ if (ocfs2_inode_is_unlinkable(inode)) {
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
OCFS2_I(inode)->ip_blkno,
orphan_name, &orphan_insert);
@@ -859,6 +932,7 @@ static int ocfs2_unlink(struct inode *dir,
mlog_errno(status);
goto leave;
}
+ is_unlinkable = true;
}
handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
@@ -878,15 +952,6 @@ static int ocfs2_unlink(struct inode *dir,
fe = (struct ocfs2_dinode *) fe_bh->b_data;
- if (inode_is_unlinkable(inode)) {
- status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
- &orphan_insert, orphan_dir);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
- }
-
/* delete the name from the parent dir */
status = ocfs2_delete_entry(handle, dir, &lookup);
if (status < 0) {
@@ -898,12 +963,7 @@ static int ocfs2_unlink(struct inode *dir,
drop_nlink(inode);
drop_nlink(inode);
ocfs2_set_links_count(fe, inode->i_nlink);
-
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_journal_dirty(handle, fe_bh);
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
if (S_ISDIR(inode->i_mode))
@@ -914,6 +974,14 @@ static int ocfs2_unlink(struct inode *dir,
mlog_errno(status);
if (S_ISDIR(inode->i_mode))
inc_nlink(dir);
+ goto leave;
+ }
+
+ if (is_unlinkable) {
+ status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
+ orphan_name, &orphan_insert, orphan_dir);
+ if (status < 0)
+ mlog_errno(status);
}
leave:
@@ -938,11 +1006,71 @@ leave:
ocfs2_free_dir_lookup_result(&orphan_insert);
ocfs2_free_dir_lookup_result(&lookup);
- mlog_exit(status);
+ if (status && (status != -ENOTEMPTY) && (status != -ENOENT))
+ mlog_errno(status);
return status;
}
+static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
+ u64 src_inode_no, u64 dest_inode_no)
+{
+ int ret = 0, i = 0;
+ u64 parent_inode_no = 0;
+ u64 child_inode_no = src_inode_no;
+ struct inode *child_inode;
+
+#define MAX_LOOKUP_TIMES 32
+ while (1) {
+ child_inode = ocfs2_iget(osb, child_inode_no, 0, 0);
+ if (IS_ERR(child_inode)) {
+ ret = PTR_ERR(child_inode);
+ break;
+ }
+
+ ret = ocfs2_inode_lock(child_inode, NULL, 0);
+ if (ret < 0) {
+ iput(child_inode);
+ if (ret != -ENOENT)
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2,
+ &parent_inode_no);
+ ocfs2_inode_unlock(child_inode, 0);
+ iput(child_inode);
+ if (ret < 0) {
+ ret = -ENOENT;
+ break;
+ }
+
+ if (parent_inode_no == dest_inode_no) {
+ ret = 1;
+ break;
+ }
+
+ if (parent_inode_no == osb->root_inode->i_ino) {
+ ret = 0;
+ break;
+ }
+
+ child_inode_no = parent_inode_no;
+
+ if (++i >= MAX_LOOKUP_TIMES) {
+ mlog(ML_NOTICE, "max lookup times reached, filesystem "
+ "may have nested directories, "
+ "src inode: %llu, dest inode: %llu.\n",
+ (unsigned long long)src_inode_no,
+ (unsigned long long)dest_inode_no);
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
/*
* The only place this should be used is rename!
* if they have the same id, then the 1st one is the only one locked.
@@ -954,25 +1082,41 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
struct inode *inode2)
{
int status;
+ int inode1_is_ancestor, inode2_is_ancestor;
struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
struct buffer_head **tmpbh;
struct inode *tmpinode;
- mlog_entry("(inode1 = %llu, inode2 = %llu)\n",
- (unsigned long long)oi1->ip_blkno,
- (unsigned long long)oi2->ip_blkno);
+ trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
if (*bh1)
*bh1 = NULL;
if (*bh2)
*bh2 = NULL;
- /* we always want to lock the one with the lower lockid first. */
+ /* we always want to lock the one with the lower lockid first.
+ * and if they are nested, we lock ancestor first */
if (oi1->ip_blkno != oi2->ip_blkno) {
- if (oi1->ip_blkno < oi2->ip_blkno) {
+ inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
+ oi1->ip_blkno);
+ if (inode1_is_ancestor < 0) {
+ status = inode1_is_ancestor;
+ goto bail;
+ }
+
+ inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
+ oi2->ip_blkno);
+ if (inode2_is_ancestor < 0) {
+ status = inode2_is_ancestor;
+ goto bail;
+ }
+
+ if ((inode1_is_ancestor == 1) ||
+ (oi1->ip_blkno < oi2->ip_blkno &&
+ inode2_is_ancestor == 0)) {
/* switch id1 and id2 around */
- mlog(0, "switching them around...\n");
tmpbh = bh2;
bh2 = bh1;
bh1 = tmpbh;
@@ -998,15 +1142,23 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
* An error return must mean that no cluster locks
* were held on function exit.
*/
- if (oi1->ip_blkno != oi2->ip_blkno)
+ if (oi1->ip_blkno != oi2->ip_blkno) {
ocfs2_inode_unlock(inode2, 1);
+ brelse(*bh2);
+ *bh2 = NULL;
+ }
if (status != -ENOENT)
mlog_errno(status);
}
+ trace_ocfs2_double_lock_end(
+ (unsigned long long)OCFS2_I(inode1)->ip_blkno,
+ (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1037,21 +1189,21 @@ static int ocfs2_rename(struct inode *old_dir,
handle_t *handle = NULL;
struct buffer_head *old_dir_bh = NULL;
struct buffer_head *new_dir_bh = NULL;
- nlink_t old_dir_nlink = old_dir->i_nlink;
+ u32 old_dir_nlink = old_dir->i_nlink;
struct ocfs2_dinode *old_di;
struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
struct ocfs2_dir_lookup_result target_insert = { NULL, };
+ bool should_add_orphan = false;
/* At some point it might be nice to break this function up a
* bit. */
- mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
- old_dir, old_dentry, new_dir, new_dentry,
- old_dentry->d_name.len, old_dentry->d_name.name,
- new_dentry->d_name.len, new_dentry->d_name.name);
+ trace_ocfs2_rename(old_dir, old_dentry, new_dir, new_dentry,
+ old_dentry->d_name.len, old_dentry->d_name.name,
+ new_dentry->d_name.len, new_dentry->d_name.name);
dquot_initialize(old_dir);
dquot_initialize(new_dir);
@@ -1081,6 +1233,21 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
rename_lock = 1;
+
+ /* here we cannot guarantee the inodes haven't just been
+ * changed, so check if they are nested again */
+ status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
+ old_inode->i_ino);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ } else if (status == 1) {
+ status = -EPERM;
+ trace_ocfs2_rename_not_permitted(
+ (unsigned long long)old_inode->i_ino,
+ (unsigned long long)new_dir->i_ino);
+ goto bail;
+ }
}
/* if old and new are the same, this'll just do one lock. */
@@ -1208,16 +1375,15 @@ static int ocfs2_rename(struct inode *old_dir,
if (!new_inode) {
status = -EACCES;
- mlog(0, "We found an inode for name %.*s but VFS "
- "didn't give us one.\n", new_dentry->d_name.len,
- new_dentry->d_name.name);
+ trace_ocfs2_rename_target_exists(new_dentry->d_name.len,
+ new_dentry->d_name.name);
goto bail;
}
if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
status = -EACCES;
- mlog(0, "Inode %llu and dir %llu disagree. flags = %x\n",
+ trace_ocfs2_rename_disagree(
(unsigned long long)OCFS2_I(new_inode)->ip_blkno,
(unsigned long long)newfe_blkno,
OCFS2_I(new_inode)->ip_flags);
@@ -1240,8 +1406,7 @@ static int ocfs2_rename(struct inode *old_dir,
newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
- mlog(0, "aha rename over existing... new_blkno=%llu "
- "newfebh=%p bhblocknr=%llu\n",
+ trace_ocfs2_rename_over_existing(
(unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
(unsigned long long)newfe_bh->b_blocknr : 0ULL);
@@ -1253,6 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir,
mlog_errno(status);
goto bail;
}
+ should_add_orphan = true;
}
} else {
BUG_ON(new_dentry->d_parent->d_inode != new_dir);
@@ -1297,17 +1463,6 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
- if (S_ISDIR(new_inode->i_mode) ||
- (ocfs2_read_links_count(newfe) == 1)) {
- status = ocfs2_orphan_add(osb, handle, new_inode,
- newfe, orphan_name,
- &orphan_insert, orphan_dir);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
-
/* change the dirent to point to the correct inode */
status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
old_inode);
@@ -1321,11 +1476,15 @@ static int ocfs2_rename(struct inode *old_dir,
ocfs2_set_links_count(newfe, 0);
else
ocfs2_add_links_count(newfe, -1);
-
- status = ocfs2_journal_dirty(handle, newfe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ ocfs2_journal_dirty(handle, newfe_bh);
+ if (should_add_orphan) {
+ status = ocfs2_orphan_add(osb, handle, new_inode,
+ newfe_bh, orphan_name,
+ &orphan_insert, orphan_dir);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
}
} else {
/* if the name was not found in new_dir, add it now */
@@ -1345,10 +1504,7 @@ static int ocfs2_rename(struct inode *old_dir,
old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
-
- status = ocfs2_journal_dirty(handle, old_inode_bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, old_inode_bh);
} else
mlog_errno(status);
@@ -1372,7 +1528,7 @@ static int ocfs2_rename(struct inode *old_dir,
}
if (new_inode) {
- new_inode->i_nlink--;
+ drop_nlink(new_inode);
new_inode->i_ctime = CURRENT_TIME;
}
old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
@@ -1380,9 +1536,9 @@ static int ocfs2_rename(struct inode *old_dir,
if (update_dot_dot) {
status = ocfs2_update_entry(old_inode, handle,
&old_inode_dot_dot_res, new_dir);
- old_dir->i_nlink--;
+ drop_nlink(old_dir);
if (new_inode) {
- new_inode->i_nlink--;
+ drop_nlink(new_inode);
} else {
inc_nlink(new_dir);
mark_inode_dirty(new_dir);
@@ -1420,7 +1576,7 @@ static int ocfs2_rename(struct inode *old_dir,
OCFS2_JOURNAL_ACCESS_WRITE);
fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
ocfs2_set_links_count(fe, old_dir->i_nlink);
- status = ocfs2_journal_dirty(handle, old_dir_bh);
+ ocfs2_journal_dirty(handle, old_dir_bh);
}
}
ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
@@ -1465,7 +1621,8 @@ bail:
brelse(old_dir_bh);
brelse(new_dir_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1490,9 +1647,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
* write i_size + 1 bytes. */
blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
- mlog_entry("i_blocks = %llu, i_size = %llu, blocks = %d\n",
- (unsigned long long)inode->i_blocks,
- i_size_read(inode), blocks);
+ trace_ocfs2_create_symlink_data((unsigned long long)inode->i_blocks,
+ i_size_read(inode), blocks);
/* Sanity check -- make sure we're going to fit. */
if (bytes_left >
@@ -1552,11 +1708,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
(bytes_left > sb->s_blocksize) ? sb->s_blocksize :
bytes_left);
- status = ocfs2_journal_dirty(handle, bhs[virtual]);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, bhs[virtual]);
virtual++;
p_blkno++;
@@ -1572,7 +1724,8 @@ bail:
kfree(bhs);
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1600,9 +1753,12 @@ static int ocfs2_symlink(struct inode *dir,
};
int did_quota = 0, did_quota_inode = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
+ sigset_t oldset;
+ int did_block_signals = 0;
+ struct ocfs2_dentry_lock *dl = NULL;
- mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
- dentry, symname, dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_symlink_begin(dir, dentry, symname,
+ dentry->d_name.len, dentry->d_name.name);
dquot_initialize(dir);
@@ -1656,7 +1812,7 @@ static int ocfs2_symlink(struct inode *dir,
}
/* get security xattr */
- status = ocfs2_init_security_get(inode, dir, &si);
+ status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
if (status) {
if (status == -EOPNOTSUPP)
si.enable = 0;
@@ -1695,14 +1851,19 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
+ /* Starting to change things, restart is no longer possible. */
+ ocfs2_block_signals(&oldset);
+ did_block_signals = 1;
+
status = dquot_alloc_inode(inode);
if (status)
goto bail;
did_quota_inode = 1;
- mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
- inode->i_mode, dentry->d_name.len,
- dentry->d_name.name);
+ trace_ocfs2_symlink_create(dir, dentry, dentry->d_name.len,
+ dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ inode->i_mode);
status = ocfs2_mknod_locked(osb, dir, inode,
0, &new_fe_bh, parent_fe_bh, handle,
@@ -1715,15 +1876,16 @@ static int ocfs2_symlink(struct inode *dir,
fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
inode->i_rdev = 0;
newsize = l - 1;
+ inode->i_op = &ocfs2_symlink_inode_operations;
if (l > ocfs2_fast_symlink_chars(sb)) {
u32 offset = 0;
- inode->i_op = &ocfs2_symlink_inode_operations;
status = dquot_alloc_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status)
goto bail;
did_quota = 1;
+ inode->i_mapping->a_ops = &ocfs2_aops;
status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
new_fe_bh,
handle, data_ac, NULL,
@@ -1741,7 +1903,7 @@ static int ocfs2_symlink(struct inode *dir,
i_size_write(inode, newsize);
inode->i_blocks = ocfs2_inode_sector_count(inode);
} else {
- inode->i_op = &ocfs2_fast_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
memcpy((char *) fe->id2.i_symlink, symname, l);
i_size_write(inode, newsize);
inode->i_blocks = 0;
@@ -1771,22 +1933,28 @@ static int ocfs2_symlink(struct inode *dir,
}
}
- status = ocfs2_add_entry(handle, dentry, inode,
- le64_to_cpu(fe->i_blkno), parent_fe_bh,
- &lookup);
- if (status < 0) {
+ /*
+ * Do this before adding the entry to the directory. We add
+ * also set d_op after success so that ->d_iput() will cleanup
+ * the dentry lock even if ocfs2_add_entry() fails below.
+ */
+ status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
+ if (status) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
- if (status) {
+ dl = dentry->d_fsdata;
+
+ status = ocfs2_add_entry(handle, dentry, inode,
+ le64_to_cpu(fe->i_blkno), parent_fe_bh,
+ &lookup);
+ if (status < 0) {
mlog_errno(status);
goto bail;
}
insert_inode_hash(inode);
- dentry->d_op = &ocfs2_dentry_ops;
d_instantiate(dentry, inode);
bail:
if (status < 0 && did_quota)
@@ -1798,10 +1966,11 @@ bail:
ocfs2_commit_trans(osb, handle);
ocfs2_inode_unlock(dir, 1);
+ if (did_block_signals)
+ ocfs2_unblock_signals(&oldset);
brelse(new_fe_bh);
brelse(parent_fe_bh);
- kfree(si.name);
kfree(si.value);
ocfs2_free_dir_lookup_result(&lookup);
if (inode_ac)
@@ -1811,11 +1980,16 @@ bail:
if (xattr_ac)
ocfs2_free_alloc_context(xattr_ac);
if ((status < 0) && inode) {
+ if (dl)
+ ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
+ OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
clear_nlink(inode);
iput(inode);
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1824,8 +1998,6 @@ static int ocfs2_blkno_stringify(u64 blkno, char *name)
{
int status, namelen;
- mlog_entry_void();
-
namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx",
(long long)blkno);
if (namelen <= 0) {
@@ -1842,76 +2014,133 @@ static int ocfs2_blkno_stringify(u64 blkno, char *name)
goto bail;
}
- mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
- namelen);
+ trace_ocfs2_blkno_stringify(blkno, name, namelen);
status = 0;
bail:
- mlog_exit(status);
+ if (status < 0)
+ mlog_errno(status);
return status;
}
-static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
- struct inode **ret_orphan_dir,
- u64 blkno,
- char *name,
- struct ocfs2_dir_lookup_result *lookup)
+static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
+ struct inode **ret_orphan_dir,
+ struct buffer_head **ret_orphan_dir_bh)
{
struct inode *orphan_dir_inode;
struct buffer_head *orphan_dir_bh = NULL;
- int status = 0;
-
- status = ocfs2_blkno_stringify(blkno, name);
- if (status < 0) {
- mlog_errno(status);
- return status;
- }
+ int ret = 0;
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
ORPHAN_DIR_SYSTEM_INODE,
osb->slot_num);
if (!orphan_dir_inode) {
- status = -ENOENT;
- mlog_errno(status);
- return status;
+ ret = -ENOENT;
+ mlog_errno(ret);
+ return ret;
}
mutex_lock(&orphan_dir_inode->i_mutex);
- status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
+ ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ if (ret < 0) {
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
+
+ mlog_errno(ret);
+ return ret;
}
- status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
- orphan_dir_bh, name,
- OCFS2_ORPHAN_NAMELEN, lookup);
- if (status < 0) {
- ocfs2_inode_unlock(orphan_dir_inode, 1);
+ *ret_orphan_dir = orphan_dir_inode;
+ *ret_orphan_dir_bh = orphan_dir_bh;
- mlog_errno(status);
- goto leave;
+ return 0;
+}
+
+static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
+ struct buffer_head *orphan_dir_bh,
+ u64 blkno,
+ char *name,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
+
+ ret = ocfs2_blkno_stringify(blkno, name);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
+ orphan_dir_bh, name,
+ OCFS2_ORPHAN_NAMELEN, lookup);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for
+ * insertion of an orphan.
+ * @osb: ocfs2 file system
+ * @ret_orphan_dir: Orphan dir inode - returned locked!
+ * @blkno: Actual block number of the inode to be inserted into orphan dir.
+ * @lookup: dir lookup result, to be passed back into functions like
+ * ocfs2_orphan_add
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure.
+ */
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+ struct inode **ret_orphan_dir,
+ u64 blkno,
+ char *name,
+ struct ocfs2_dir_lookup_result *lookup)
+{
+ struct inode *orphan_dir_inode = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
+ int ret = 0;
+
+ ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode,
+ &orphan_dir_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
+ blkno, name, lookup);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
}
*ret_orphan_dir = orphan_dir_inode;
-leave:
- if (status) {
+out:
+ brelse(orphan_dir_bh);
+
+ if (ret) {
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
mutex_unlock(&orphan_dir_inode->i_mutex);
iput(orphan_dir_inode);
}
- brelse(orphan_dir_bh);
-
- mlog_exit(status);
- return status;
+ if (ret)
+ mlog_errno(ret);
+ return ret;
}
static int ocfs2_orphan_add(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct ocfs2_dinode *fe,
+ struct buffer_head *fe_bh,
char *name,
struct ocfs2_dir_lookup_result *lookup,
struct inode *orphan_dir_inode)
@@ -1919,8 +2148,10 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
struct buffer_head *orphan_dir_bh = NULL;
int status = 0;
struct ocfs2_dinode *orphan_fe;
+ struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
- mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+ trace_ocfs2_orphan_add_begin(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
if (status < 0) {
@@ -1937,18 +2168,28 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
goto leave;
}
+ /*
+ * We're going to journal the change of i_flags and i_orphaned_slot.
+ * It's safe anyway, though some callers may duplicate the journaling.
+ * Journaling within the func just make the logic look more
+ * straightforward.
+ */
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(inode),
+ fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
/* we're a cluster, and nlink can change on disk from
* underneath us... */
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
if (S_ISDIR(inode->i_mode))
ocfs2_add_links_count(orphan_fe, 1);
- orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
-
- status = ocfs2_journal_dirty(handle, orphan_dir_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+ ocfs2_journal_dirty(handle, orphan_dir_bh);
status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
OCFS2_ORPHAN_NAMELEN, inode,
@@ -1956,23 +2197,32 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
orphan_dir_bh, lookup);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto rollback;
}
- le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
+ fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
+ OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
/* Record which orphan dir our inode now resides
* in. delete_inode will use this to determine which orphan
* dir to lock. */
fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
- mlog(0, "Inode %llu orphaned in slot %d\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
+ ocfs2_journal_dirty(handle, fe_bh);
+
+ trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ osb->slot_num);
+
+rollback:
+ if (status < 0) {
+ if (S_ISDIR(inode->i_mode))
+ ocfs2_add_links_count(orphan_fe, -1);
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+ }
leave:
brelse(orphan_dir_bh);
- mlog_exit(status);
return status;
}
@@ -1988,17 +2238,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
int status = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
- mlog_entry_void();
-
status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- mlog(0, "removing '%s' from orphan dir %llu (namelen=%d)\n",
- name, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
- OCFS2_ORPHAN_NAMELEN);
+ trace_ocfs2_orphan_del(
+ (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
+ name, OCFS2_ORPHAN_NAMELEN);
/* find it's spot in the orphan directory */
status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
@@ -2028,21 +2276,110 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
if (S_ISDIR(inode->i_mode))
ocfs2_add_links_count(orphan_fe, -1);
- orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
-
- status = ocfs2_journal_dirty(handle, orphan_dir_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+ ocfs2_journal_dirty(handle, orphan_dir_bh);
leave:
ocfs2_free_dir_lookup_result(&lookup);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
+/**
+ * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to receive a newly
+ * allocated file. This is different from the typical 'add to orphan dir'
+ * operation in that the inode does not yet exist. This is a problem because
+ * the orphan dir stringifies the inode block number to come up with it's
+ * dirent. Obviously if the inode does not yet exist we have a chicken and egg
+ * problem. This function works around it by calling deeper into the orphan
+ * and suballoc code than other callers. Use this only by necessity.
+ * @dir: The directory which this inode will ultimately wind up under - not the
+ * orphan dir!
+ * @dir_bh: buffer_head the @dir inode block
+ * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled
+ * with the string to be used for orphan dirent. Pass back to the orphan dir
+ * code.
+ * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan
+ * dir code.
+ * @ret_di_blkno: block number where the new inode will be allocated.
+ * @orphan_insert: Dir insert context to be passed back into orphan dir code.
+ * @ret_inode_ac: Inode alloc context to be passed back to the allocator.
+ *
+ * Returns zero on success and the ret_orphan_dir, name and lookup
+ * fields will be populated.
+ *
+ * Returns non-zero on failure.
+ */
+static int ocfs2_prep_new_orphaned_file(struct inode *dir,
+ struct buffer_head *dir_bh,
+ char *orphan_name,
+ struct inode **ret_orphan_dir,
+ u64 *ret_di_blkno,
+ struct ocfs2_dir_lookup_result *orphan_insert,
+ struct ocfs2_alloc_context **ret_inode_ac)
+{
+ int ret;
+ u64 di_blkno;
+ struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+ struct inode *orphan_dir = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
+ struct ocfs2_alloc_context *inode_ac = NULL;
+
+ ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* reserve an inode spot */
+ ret = ocfs2_reserve_new_inode(osb, &inode_ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac,
+ &di_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
+ di_blkno, orphan_name, orphan_insert);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ret == 0) {
+ *ret_orphan_dir = orphan_dir;
+ *ret_di_blkno = di_blkno;
+ *ret_inode_ac = inode_ac;
+ /*
+ * orphan_name and orphan_insert are already up to
+ * date via prepare_orphan_dir
+ */
+ } else {
+ /* Unroll reserve_new_inode* */
+ if (inode_ac)
+ ocfs2_free_alloc_context(inode_ac);
+
+ /* Unroll orphan dir locking */
+ mutex_unlock(&orphan_dir->i_mutex);
+ ocfs2_inode_unlock(orphan_dir, 1);
+ iput(orphan_dir);
+ }
+
+ brelse(orphan_dir_bh);
+
+ return ret;
+}
+
int ocfs2_create_inode_in_orphan(struct inode *dir,
int mode,
struct inode **new_inode)
@@ -2058,6 +2395,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
struct buffer_head *new_di_bh = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+ u64 uninitialized_var(di_blkno), suballoc_loc;
+ u16 suballoc_bit;
status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
if (status < 0) {
@@ -2066,20 +2405,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
return status;
}
- /*
- * We give the orphan dir the root blkno to fake an orphan name,
- * and allocate enough space for our insertion.
- */
- status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
- osb->root_blkno,
- orphan_name, &orphan_insert);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- /* reserve an inode spot */
- status = ocfs2_reserve_new_inode(osb, &inode_ac);
+ status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh,
+ orphan_name, &orphan_dir,
+ &di_blkno, &orphan_insert, &inode_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -2106,24 +2434,27 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
goto leave;
did_quota_inode = 1;
- inode->i_nlink = 0;
- /* do the real work now. */
- status = ocfs2_mknod_locked(osb, dir, inode,
- 0, &new_di_bh, parent_di_bh, handle,
- inode_ac);
+ status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac,
+ &suballoc_loc,
+ &suballoc_bit, di_blkno);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name);
+ clear_nlink(inode);
+ /* do the real work now. */
+ status = __ocfs2_mknod_locked(dir, inode,
+ 0, &new_di_bh, parent_di_bh, handle,
+ inode_ac, di_blkno, suballoc_loc,
+ suballoc_bit);
if (status < 0) {
mlog_errno(status);
goto leave;
}
di = (struct ocfs2_dinode *)new_di_bh->b_data;
- status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name,
+ status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
&orphan_insert, orphan_dir);
if (status < 0) {
mlog_errno(status);
@@ -2149,9 +2480,6 @@ leave:
iput(orphan_dir);
}
- if (status == -ENOSPC)
- mlog(0, "Disk is full\n");
-
if ((status < 0) && inode) {
clear_nlink(inode);
iput(inode);
@@ -2186,8 +2514,10 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
struct buffer_head *di_bh = NULL;
struct ocfs2_dir_lookup_result lookup = { NULL, };
- mlog_entry("(0x%p, 0x%p, %.*s')\n", dir, dentry,
- dentry->d_name.len, dentry->d_name.name);
+ trace_ocfs2_mv_orphaned_inode_to_new(dir, dentry,
+ dentry->d_name.len, dentry->d_name.name,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
if (status < 0) {
@@ -2265,10 +2595,11 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
}
di = (struct ocfs2_dinode *)di_bh->b_data;
- le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
+ di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);
di->i_orphaned_slot = 0;
- inode->i_nlink = 1;
+ set_nlink(inode, 1);
ocfs2_set_links_count(di, inode->i_nlink);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, di_bh);
status = ocfs2_add_entry(handle, dentry, inode,
@@ -2286,7 +2617,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
goto out_commit;
}
- dentry->d_op = &ocfs2_dentry_ops;
d_instantiate(dentry, inode);
status = 0;
out_commit:
@@ -2305,7 +2635,8 @@ leave:
ocfs2_free_dir_lookup_result(&lookup);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -2328,4 +2659,6 @@ const struct inode_operations ocfs2_dir_iops = {
.listxattr = ocfs2_listxattr,
.removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
+ .get_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
};
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1238b491db9..bbec539230f 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -30,6 +30,7 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/list.h>
+#include <linux/llist.h>
#include <linux/rbtree.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
@@ -47,6 +48,7 @@
/* For struct ocfs2_blockcheck_stats */
#include "blockcheck.h"
+#include "reservations.h"
/* Caching of metadata buffers */
@@ -146,43 +148,55 @@ struct ocfs2_lock_res_ops;
typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
+#ifdef CONFIG_OCFS2_FS_STATS
+struct ocfs2_lock_stats {
+ u64 ls_total; /* Total wait in NSEC */
+ u32 ls_gets; /* Num acquires */
+ u32 ls_fail; /* Num failed acquires */
+
+ /* Storing max wait in usecs saves 24 bytes per inode */
+ u32 ls_max; /* Max wait in USEC */
+};
+#endif
+
struct ocfs2_lock_res {
void *l_priv;
struct ocfs2_lock_res_ops *l_ops;
- spinlock_t l_lock;
+
struct list_head l_blocked_list;
struct list_head l_mask_waiters;
- enum ocfs2_lock_type l_type;
unsigned long l_flags;
char l_name[OCFS2_LOCK_ID_MAX_LEN];
- int l_level;
unsigned int l_ro_holders;
unsigned int l_ex_holders;
- struct ocfs2_dlm_lksb l_lksb;
+ signed char l_level;
+ signed char l_requested;
+ signed char l_blocking;
+
+ /* Data packed - type enum ocfs2_lock_type */
+ unsigned char l_type;
/* used from AST/BAST funcs. */
- enum ocfs2_ast_action l_action;
- enum ocfs2_unlock_action l_unlock_action;
- int l_requested;
- int l_blocking;
+ /* Data packed - enum type ocfs2_ast_action */
+ unsigned char l_action;
+ /* Data packed - enum type ocfs2_unlock_action */
+ unsigned char l_unlock_action;
unsigned int l_pending_gen;
+ spinlock_t l_lock;
+
+ struct ocfs2_dlm_lksb l_lksb;
+
wait_queue_head_t l_event;
struct list_head l_debug_list;
#ifdef CONFIG_OCFS2_FS_STATS
- unsigned long long l_lock_num_prmode; /* PR acquires */
- unsigned long long l_lock_num_exmode; /* EX acquires */
- unsigned int l_lock_num_prmode_failed; /* Failed PR gets */
- unsigned int l_lock_num_exmode_failed; /* Failed EX gets */
- unsigned long long l_lock_total_prmode; /* Tot wait for PR */
- unsigned long long l_lock_total_exmode; /* Tot wait for EX */
- unsigned int l_lock_max_prmode; /* Max wait for PR */
- unsigned int l_lock_max_exmode; /* Max wait for EX */
- unsigned int l_lock_refresh; /* Disk refreshes */
+ struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */
+ u32 l_lock_refresh; /* Disk refreshes */
+ struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map l_lockdep_map;
@@ -242,7 +256,7 @@ enum ocfs2_local_alloc_state
enum ocfs2_mount_options
{
- OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */
+ OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
@@ -255,28 +269,30 @@ enum ocfs2_mount_options
control lists */
OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
+ OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
+ writes */
+ OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
+ OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
};
-#define OCFS2_OSB_SOFT_RO 0x0001
-#define OCFS2_OSB_HARD_RO 0x0002
-#define OCFS2_OSB_ERROR_FS 0x0004
-#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008
-
-#define OCFS2_DEFAULT_ATIME_QUANTUM 60
+#define OCFS2_OSB_SOFT_RO 0x0001
+#define OCFS2_OSB_HARD_RO 0x0002
+#define OCFS2_OSB_ERROR_FS 0x0004
+#define OCFS2_DEFAULT_ATIME_QUANTUM 60
struct ocfs2_journal;
struct ocfs2_slot_info;
struct ocfs2_recovery_map;
struct ocfs2_replay_map;
struct ocfs2_quota_recovery;
-struct ocfs2_dentry_lock;
struct ocfs2_super
{
struct task_struct *commit_task;
struct super_block *sb;
struct inode *root_inode;
struct inode *sys_root_inode;
- struct inode *system_inodes[NUM_SYSTEM_INODES];
+ struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
+ struct inode **local_system_inodes;
struct ocfs2_slot_info *slot_info;
@@ -329,7 +345,6 @@ struct ocfs2_super
struct task_struct *recovery_thread_task;
int disable_recovery;
wait_queue_head_t checkpoint_event;
- atomic_t needs_checkpoint;
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
@@ -341,6 +356,9 @@ struct ocfs2_super
*/
unsigned int local_alloc_bits;
unsigned int local_alloc_default_bits;
+ /* osb_clusters_at_boot can become stale! Do not trust it to
+ * be up to date. */
+ unsigned int osb_clusters_at_boot;
enum ocfs2_local_alloc_state local_alloc_state; /* protected
* by osb_lock */
@@ -349,6 +367,11 @@ struct ocfs2_super
u64 la_last_gd;
+ struct ocfs2_reservation_map osb_la_resmap;
+
+ unsigned int osb_resv_level;
+ unsigned int osb_dir_resv_level;
+
/* Next three fields are for local node slot recovery during
* mount. */
int dirty;
@@ -359,7 +382,10 @@ struct ocfs2_super
struct ocfs2_alloc_stats alloc_stats;
char dev_str[20]; /* "major,minor" of the device */
+ u8 osb_stackflags;
+
char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+ char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];
struct ocfs2_cluster_connection *cconn;
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
@@ -386,10 +412,9 @@ struct ocfs2_super
struct list_head blocked_lock_list;
unsigned long blocked_lock_count;
- /* List of dentry locks to release. Anyone can add locks to
- * the list, ocfs2_wq processes the list */
- struct ocfs2_dentry_lock *dentry_lock_list;
- struct work_struct dentry_lock_work;
+ /* List of dquot structures to drop last reference to */
+ struct llist_head dquot_drop_list;
+ struct work_struct dquot_drop_work;
wait_queue_head_t osb_mount_event;
@@ -397,6 +422,12 @@ struct ocfs2_super
struct inode *osb_tl_inode;
struct buffer_head *osb_tl_bh;
struct delayed_work osb_truncate_log_wq;
+ atomic_t osb_tl_disable;
+ /*
+ * How many clusters in our truncate log.
+ * It must be protected by osb_tl_inode->i_mutex.
+ */
+ unsigned int truncated_clusters;
struct ocfs2_node_map osb_recovering_orphan_dirs;
unsigned int *osb_orphan_wipes;
@@ -416,6 +447,8 @@ struct ocfs2_super
/* rb tree root for refcount lock. */
struct rb_root osb_rf_lock_tree;
struct ocfs2_refcount_tree *osb_ref_tree_lru;
+
+ struct mutex system_file_mutex;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -482,6 +515,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
return 0;
}
+static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
+ return 1;
+ return 0;
+}
+
static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
{
if (ocfs2_supports_indexed_dirs(osb))
@@ -539,18 +579,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
spin_unlock(&osb->osb_lock);
}
-
-static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb,
- unsigned long flag)
-{
- unsigned long ret;
-
- spin_lock(&osb->osb_lock);
- ret = osb->osb_flags & flag;
- spin_unlock(&osb->osb_lock);
- return ret;
-}
-
static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
int hard)
{
@@ -585,10 +613,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
return ret;
}
-static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
{
return (osb->s_feature_incompat &
- OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK);
+ (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
+ OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
+}
+
+static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+{
+ if (ocfs2_clusterinfo_valid(osb) &&
+ memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
+{
+ if (ocfs2_clusterinfo_valid(osb) &&
+ !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
+{
+ return ocfs2_o2cb_stack(osb) &&
+ (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
}
static inline int ocfs2_mount_local(struct ocfs2_super *osb)
@@ -763,10 +816,73 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
}
-#define ocfs2_set_bit ext2_set_bit
-#define ocfs2_clear_bit ext2_clear_bit
-#define ocfs2_test_bit ext2_test_bit
-#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
-#define ocfs2_find_next_bit ext2_find_next_bit
+static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
+ unsigned int clusters)
+{
+ return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
+}
+
+static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
+{
+ __set_bit_le(bit, bitmap);
+}
+#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
+
+static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
+{
+ __clear_bit_le(bit, bitmap);
+}
+#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
+
+#define ocfs2_test_bit test_bit_le
+#define ocfs2_find_next_zero_bit find_next_zero_bit_le
+#define ocfs2_find_next_bit find_next_bit_le
+
+static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr)
+{
+#if BITS_PER_LONG == 64
+ *bit += ((unsigned long) addr & 7UL) << 3;
+ addr = (void *) ((unsigned long) addr & ~7UL);
+#elif BITS_PER_LONG == 32
+ *bit += ((unsigned long) addr & 3UL) << 3;
+ addr = (void *) ((unsigned long) addr & ~3UL);
+#else
+#error "how many bits you are?!"
+#endif
+ return addr;
+}
+
+static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ ocfs2_set_bit(bit, bitmap);
+}
+
+static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ ocfs2_clear_bit(bit, bitmap);
+}
+
+static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap)
+{
+ bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+ return ocfs2_test_bit(bit, bitmap);
+}
+
+static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max,
+ int start)
+{
+ int fix = 0, ret, tmpmax;
+ bitmap = correct_addr_and_bit_unaligned(&fix, bitmap);
+ tmpmax = max + fix;
+ start += fix;
+
+ ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
+}
+
#endif /* OCFS2_H */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index bb37218a797..938387a10d5 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -100,7 +100,9 @@
| OCFS2_FEATURE_INCOMPAT_XATTR \
| OCFS2_FEATURE_INCOMPAT_META_ECC \
| OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
- | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
+ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
+ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
+ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
| OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
| OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -165,6 +167,16 @@
/* Refcount tree support */
#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
+/* Discontigous block groups */
+#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
+
+/*
+ * Incompat bit to indicate useable clusterinfo with stackflags for all
+ * cluster stacks (userspace adnd o2cb). If this bit is set,
+ * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
+ */
+#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
+
/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
@@ -231,18 +243,31 @@
#define OCFS2_HAS_REFCOUNT_FL (0x0010)
/* Inode attributes, keep in sync with EXT2 */
-#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */
-#define OCFS2_UNRM_FL (0x00000002) /* Undelete */
-#define OCFS2_COMPR_FL (0x00000004) /* Compress file */
-#define OCFS2_SYNC_FL (0x00000008) /* Synchronous updates */
-#define OCFS2_IMMUTABLE_FL (0x00000010) /* Immutable file */
-#define OCFS2_APPEND_FL (0x00000020) /* writes to file may only append */
-#define OCFS2_NODUMP_FL (0x00000040) /* do not dump file */
-#define OCFS2_NOATIME_FL (0x00000080) /* do not update atime */
-#define OCFS2_DIRSYNC_FL (0x00010000) /* dirsync behaviour (directories only) */
-
-#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */
-#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */
+#define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */
+#define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */
+#define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */
+#define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */
+#define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */
+#define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */
+#define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */
+#define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */
+/* Reserved for compression usage... */
+#define OCFS2_DIRTY_FL FS_DIRTY_FL
+#define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */
+#define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */
+#define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */
+/* End compression flags --- maybe not all used */
+#define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */
+#define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */
+#define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */
+#define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */
+#define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */
+#define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */
+#define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/
+#define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */
+
+#define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */
+#define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */
/*
* Extent record flags (e_node.leaf.flags)
@@ -275,27 +300,27 @@
#define OCFS2_VOL_UUID_LEN 16
#define OCFS2_MAX_VOL_LABEL_LEN 64
-/* The alternate, userspace stack fields */
+/* The cluster stack fields */
#define OCFS2_STACK_LABEL_LEN 4
#define OCFS2_CLUSTER_NAME_LEN 16
+/* Classic (historically speaking) cluster stack */
+#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb"
+
/* Journal limits (in bytes) */
#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
/*
- * Default local alloc size (in megabytes)
- *
- * The value chosen should be such that most allocations, including new
- * block groups, use local alloc.
- */
-#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
-
-/*
* Inline extended attribute size (in bytes)
* The value chosen should be aligned to 16 byte boundaries.
*/
#define OCFS2_MIN_XATTR_INLINE_SIZE 256
+/*
+ * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
+ */
+#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01)
+
struct ocfs2_system_inode_info {
char *si_name;
int si_iflags;
@@ -313,6 +338,7 @@ enum {
USER_QUOTA_SYSTEM_INODE,
GROUP_QUOTA_SYSTEM_INODE,
#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
+#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
ORPHAN_DIR_SYSTEM_INODE,
EXTENT_ALLOC_SYSTEM_INODE,
INODE_ALLOC_SYSTEM_INODE,
@@ -321,8 +347,12 @@ enum {
TRUNCATE_LOG_SYSTEM_INODE,
LOCAL_USER_QUOTA_SYSTEM_INODE,
LOCAL_GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
NUM_SYSTEM_INODES
};
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE
+#define NUM_LOCAL_SYSTEM_INODES \
+ (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
/* Global system inodes (single copy) */
@@ -351,6 +381,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
/* Parameter passed from mount.ocfs2 to module */
#define OCFS2_HB_NONE "heartbeat=none"
#define OCFS2_HB_LOCAL "heartbeat=local"
+#define OCFS2_HB_GLOBAL "heartbeat=global"
/*
* OCFS2 directory file types. Only the low 3 bits are used. The
@@ -410,7 +441,7 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
struct ocfs2_block_check {
/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */
__le16 bc_ecc; /* Single-error-correction parity vector.
- This is a simple Hamming code dependant
+ This is a simple Hamming code dependent
on the blocksize. OCFS2's maximum
blocksize, 4K, requires 16 parity bits,
so we fit in __le16. */
@@ -512,7 +543,10 @@ struct ocfs2_extent_block
block group */
__le32 h_fs_generation; /* Must match super block */
__le64 h_blkno; /* Offset on disk, in blocks */
-/*20*/ __le64 h_reserved3;
+/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this
+ eb belongs to. Only valid
+ if allocated from a
+ discontiguous block group */
__le64 h_next_leaf_blk; /* Offset on disk, in blocks,
of next leaf header pointing
to data */
@@ -554,9 +588,21 @@ struct ocfs2_slot_map_extended {
*/
};
+/*
+ * ci_stackflags is only valid if the incompat bit
+ * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
+ */
struct ocfs2_cluster_info {
/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
- __le32 ci_reserved;
+ union {
+ __le32 ci_reserved;
+ struct {
+ __u8 ci_stackflags;
+ __u8 ci_reserved1;
+ __u8 ci_reserved2;
+ __u8 ci_reserved3;
+ };
+ };
/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
/*18*/
};
@@ -593,9 +639,9 @@ struct ocfs2_super_block {
* group header */
/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
-/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
- stack. Only valid
- with INCOMPAT flag. */
+/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
+ userspace or clusterinfo
+ INCOMPAT flag set. */
/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
for this fs*/
__le16 s_reserved0;
@@ -679,7 +725,11 @@ struct ocfs2_dinode {
/*80*/ struct ocfs2_block_check i_check; /* Error checking */
/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
/*90*/ __le64 i_refcount_loc;
- __le64 i_reserved2[4];
+ __le64 i_suballoc_loc; /* Suballocator block group this
+ inode belongs to. Only valid
+ if allocated from a
+ discontiguous block group */
+/*A0*/ __le64 i_reserved2[3];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
@@ -700,7 +750,7 @@ struct ocfs2_dinode {
after an unclean
shutdown */
} journal1;
- } id1; /* Inode type dependant 1 */
+ } id1; /* Inode type dependent 1 */
/*C0*/ union {
struct ocfs2_super_block i_super;
struct ocfs2_local_alloc i_lab;
@@ -814,7 +864,12 @@ struct ocfs2_dx_root_block {
__le32 dr_reserved2;
__le64 dr_free_blk; /* Pointer to head of free
* unindexed block list. */
- __le64 dr_reserved3[15];
+ __le64 dr_suballoc_loc; /* Suballocator block group
+ this root belongs to.
+ Only valid if allocated
+ from a discontiguous
+ block group */
+ __le64 dr_reserved3[14];
union {
struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
* bits for maximum space
@@ -840,6 +895,13 @@ struct ocfs2_dx_leaf {
};
/*
+ * Largest bitmap for a block (suballocator) group in bytes. This limit
+ * does not affect cluster groups (global allocator). Cluster group
+ * bitmaps run to the end of the block.
+ */
+#define OCFS2_MAX_BG_BITMAP_SIZE 256
+
+/*
* On disk allocator group structure for OCFS2
*/
struct ocfs2_group_desc
@@ -860,7 +922,29 @@ struct ocfs2_group_desc
__le64 bg_blkno; /* Offset on disk, in blocks */
/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
__le64 bg_reserved2;
-/*40*/ __u8 bg_bitmap[0];
+/*40*/ union {
+ __u8 bg_bitmap[0];
+ struct {
+ /*
+ * Block groups may be discontiguous when
+ * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
+ * The extents of a discontigous block group are
+ * stored in bg_list. It is a flat list.
+ * l_tree_depth must always be zero. A
+ * discontiguous group is signified by a non-zero
+ * bg_list->l_next_free_rec. Only block groups
+ * can be discontiguous; Cluster groups cannot.
+ * We've never made a block group with more than
+ * 2048 blocks (256 bytes of bg_bitmap). This
+ * codifies that limit so that we can fit bg_list.
+ * bg_size of a discontiguous block group will
+ * be 256 to match bg_bitmap_filler.
+ */
+ __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
+/*140*/ struct ocfs2_extent_list bg_list;
+ };
+ };
+/* Actual on-disk size is one block */
};
struct ocfs2_refcount_rec {
@@ -905,7 +989,11 @@ struct ocfs2_refcount_block {
/*40*/ __le32 rf_generation; /* generation number. all be the same
* for the same refcount tree. */
__le32 rf_reserved0;
- __le64 rf_reserved1[7];
+ __le64 rf_suballoc_loc; /* Suballocator block group this
+ refcount block belongs to. Only
+ valid if allocated from a
+ discontiguous block group */
+/*50*/ __le64 rf_reserved1[6];
/*80*/ union {
struct ocfs2_refcount_list rf_records; /* List of refcount
records */
@@ -931,7 +1019,7 @@ struct ocfs2_xattr_entry {
__le16 xe_name_offset; /* byte offset from the 1st entry in the
local xattr storage(inode, xattr block or
xattr bucket). */
- __u8 xe_name_len; /* xattr name len, does't include prefix. */
+ __u8 xe_name_len; /* xattr name len, doesn't include prefix. */
__u8 xe_type; /* the low 7 bits indicate the name prefix
* type and the highest bit indicates whether
* the EA is stored in the local storage. */
@@ -1017,7 +1105,10 @@ struct ocfs2_xattr_block {
real xattr or a xattr tree. */
__le16 xb_reserved0;
__le32 xb_reserved1;
- __le64 xb_reserved2;
+ __le64 xb_suballoc_loc; /* Suballocator block group this
+ xattr block belongs to. Only
+ valid if allocated from a
+ discontiguous block group */
/*30*/ union {
struct ocfs2_xattr_header xb_header; /* xattr header if this
block contains xattr */
@@ -1254,6 +1345,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
return size / sizeof(struct ocfs2_extent_rec);
}
+static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_group_desc, bg_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
+
static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
{
int size;
@@ -1284,13 +1385,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
return size;
}
-static inline int ocfs2_group_bitmap_size(struct super_block *sb)
+static inline int ocfs2_group_bitmap_size(struct super_block *sb,
+ int suballocator,
+ u32 feature_incompat)
{
- int size;
-
- size = sb->s_blocksize -
+ int size = sb->s_blocksize -
offsetof(struct ocfs2_group_desc, bg_bitmap);
+ /*
+ * The cluster allocator uses the entire block. Suballocators have
+ * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
+ * code expects bg_size set to the maximum. Thus we must keep
+ * bg_size as-is unless discontig_bg is enabled.
+ */
+ if (suballocator &&
+ (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+ size = OCFS2_MAX_BG_BITMAP_SIZE;
+
return size;
}
@@ -1402,23 +1513,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
return size / sizeof(struct ocfs2_extent_rec);
}
-static inline int ocfs2_local_alloc_size(int blocksize)
+static inline int ocfs2_extent_recs_per_gd(int blocksize)
{
int size;
size = blocksize -
- offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+ offsetof(struct ocfs2_group_desc, bg_list.l_recs);
- return size;
+ return size / sizeof(struct ocfs2_extent_rec);
}
-static inline int ocfs2_group_bitmap_size(int blocksize)
+static inline int ocfs2_local_alloc_size(int blocksize)
{
int size;
size = blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
+
+ return size;
+}
+
+static inline int ocfs2_group_bitmap_size(int blocksize,
+ int suballocator,
+ uint32_t feature_incompat)
+{
+ int size = sb->s_blocksize -
offsetof(struct ocfs2_group_desc, bg_bitmap);
+ /*
+ * The cluster allocator uses the entire block. Suballocators have
+ * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
+ * code expects bg_size set to the maximum. Thus we must keep
+ * bg_size as-is unless discontig_bg is enabled.
+ */
+ if (suballocator &&
+ (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
+ size = OCFS2_MAX_BG_BITMAP_SIZE;
+
return size;
}
@@ -1491,5 +1622,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
+static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
+{
+ if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
+ le16_to_cpu(gd->bg_size)) !=
+ offsetof(struct ocfs2_group_desc, bg_list))
+ return 0;
+ /*
+ * Only valid to check l_next_free_rec if
+ * bg_bitmap + bg_size == bg_list.
+ */
+ if (!gd->bg_list.l_next_free_rec)
+ return 0;
+ return 1;
+}
#endif /* _OCFS2_FS_H */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 2d3420af1a8..5b27ff1fa57 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -23,10 +23,10 @@
/*
* ioctl commands
*/
-#define OCFS2_IOC_GETFLAGS _IOR('f', 1, long)
-#define OCFS2_IOC_SETFLAGS _IOW('f', 2, long)
-#define OCFS2_IOC32_GETFLAGS _IOR('f', 1, int)
-#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
+#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
/*
* Space reservation / allocation / free ioctls and argument structure
@@ -76,4 +76,167 @@ struct reflink_arguments {
};
#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
+/* Following definitions dedicated for ocfs2_info_request ioctls. */
+#define OCFS2_INFO_MAX_REQUEST (50)
+#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2)
+
+/* Magic number of all requests */
+#define OCFS2_INFO_MAGIC (0x4F32494E)
+
+/*
+ * Always try to separate info request into small pieces to
+ * guarantee the backward&forward compatibility.
+ */
+struct ocfs2_info {
+ __u64 oi_requests; /* Array of __u64 pointers to requests */
+ __u32 oi_count; /* Number of requests in info_requests */
+ __u32 oi_pad;
+};
+
+struct ocfs2_info_request {
+/*00*/ __u32 ir_magic; /* Magic number */
+ __u32 ir_code; /* Info request code */
+ __u32 ir_size; /* Size of request */
+ __u32 ir_flags; /* Request flags */
+/*10*/ /* Request specific fields */
+};
+
+struct ocfs2_info_clustersize {
+ struct ocfs2_info_request ic_req;
+ __u32 ic_clustersize;
+ __u32 ic_pad;
+};
+
+struct ocfs2_info_blocksize {
+ struct ocfs2_info_request ib_req;
+ __u32 ib_blocksize;
+ __u32 ib_pad;
+};
+
+struct ocfs2_info_maxslots {
+ struct ocfs2_info_request im_req;
+ __u32 im_max_slots;
+ __u32 im_pad;
+};
+
+struct ocfs2_info_label {
+ struct ocfs2_info_request il_req;
+ __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN];
+} __attribute__ ((packed));
+
+struct ocfs2_info_uuid {
+ struct ocfs2_info_request iu_req;
+ __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
+} __attribute__ ((packed));
+
+struct ocfs2_info_fs_features {
+ struct ocfs2_info_request if_req;
+ __u32 if_compat_features;
+ __u32 if_incompat_features;
+ __u32 if_ro_compat_features;
+ __u32 if_pad;
+};
+
+struct ocfs2_info_journal_size {
+ struct ocfs2_info_request ij_req;
+ __u64 ij_journal_size;
+};
+
+struct ocfs2_info_freeinode {
+ struct ocfs2_info_request ifi_req;
+ struct ocfs2_info_local_freeinode {
+ __u64 lfi_total;
+ __u64 lfi_free;
+ } ifi_stat[OCFS2_MAX_SLOTS];
+ __u32 ifi_slotnum; /* out */
+ __u32 ifi_pad;
+};
+
+#define OCFS2_INFO_MAX_HIST (32)
+
+struct ocfs2_info_freefrag {
+ struct ocfs2_info_request iff_req;
+ struct ocfs2_info_freefrag_stats { /* (out) */
+ struct ocfs2_info_free_chunk_list {
+ __u32 fc_chunks[OCFS2_INFO_MAX_HIST];
+ __u32 fc_clusters[OCFS2_INFO_MAX_HIST];
+ } ffs_fc_hist;
+ __u32 ffs_clusters;
+ __u32 ffs_free_clusters;
+ __u32 ffs_free_chunks;
+ __u32 ffs_free_chunks_real;
+ __u32 ffs_min; /* Minimum free chunksize in clusters */
+ __u32 ffs_max;
+ __u32 ffs_avg;
+ __u32 ffs_pad;
+ } iff_ffs;
+ __u32 iff_chunksize; /* chunksize in clusters(in) */
+ __u32 iff_pad;
+};
+
+/* Codes for ocfs2_info_request */
+enum ocfs2_info_type {
+ OCFS2_INFO_CLUSTERSIZE = 1,
+ OCFS2_INFO_BLOCKSIZE,
+ OCFS2_INFO_MAXSLOTS,
+ OCFS2_INFO_LABEL,
+ OCFS2_INFO_UUID,
+ OCFS2_INFO_FS_FEATURES,
+ OCFS2_INFO_JOURNAL_SIZE,
+ OCFS2_INFO_FREEINODE,
+ OCFS2_INFO_FREEFRAG,
+ OCFS2_INFO_NUM_TYPES
+};
+
+/* Flags for struct ocfs2_info_request */
+/* Filled by the caller */
+#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not
+ required. This is a hint.
+ It is up to ocfs2 whether
+ the request can be fulfilled
+ without locking. */
+/* Filled by ocfs2 */
+#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood
+ this request and
+ filled in the answer */
+
+#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during
+ request handling. */
+
+#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
+
+struct ocfs2_move_extents {
+/* All values are in bytes */
+ /* in */
+ __u64 me_start; /* Virtual start in the file to move */
+ __u64 me_len; /* Length of the extents to be moved */
+ __u64 me_goal; /* Physical offset of the goal,
+ it's in block unit */
+ __u64 me_threshold; /* Maximum distance from goal or threshold
+ for auto defragmentation */
+ __u64 me_flags; /* Flags for the operation:
+ * - auto defragmentation.
+ * - refcount,xattr cases.
+ */
+ /* out */
+ __u64 me_moved_len; /* Moved/defraged length */
+ __u64 me_new_offset; /* Resulting physical location */
+ __u32 me_reserved[2]; /* Reserved for futhure */
+};
+
+#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG (0x00000001) /* Kernel manages to
+ claim new clusters
+ as the goal place
+ for extents moving */
+#define OCFS2_MOVE_EXT_FL_PART_DEFRAG (0x00000002) /* Allow partial extent
+ moving, is to make
+ movement less likely
+ to fail, may make fs
+ even more fragmented */
+#define OCFS2_MOVE_EXT_FL_COMPLETE (0x00000004) /* Move or defragmenation
+ completely gets done.
+ */
+
+#define OCFS2_IOC_MOVE_EXT _IOW('o', 6, struct ocfs2_move_extents)
+
#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
new file mode 100644
index 00000000000..6cb019b7c6a
--- /dev/null
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -0,0 +1,2768 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ocfs2
+
+#if !defined(_TRACE_OCFS2_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OCFS2_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(ocfs2__int,
+ TP_PROTO(int num),
+ TP_ARGS(num),
+ TP_STRUCT__entry(
+ __field(int, num)
+ ),
+ TP_fast_assign(
+ __entry->num = num;
+ ),
+ TP_printk("%d", __entry->num)
+);
+
+#define DEFINE_OCFS2_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__int, name, \
+ TP_PROTO(int num), \
+ TP_ARGS(num))
+
+DECLARE_EVENT_CLASS(ocfs2__uint,
+ TP_PROTO(unsigned int num),
+ TP_ARGS(num),
+ TP_STRUCT__entry(
+ __field( unsigned int, num )
+ ),
+ TP_fast_assign(
+ __entry->num = num;
+ ),
+ TP_printk("%u", __entry->num)
+);
+
+#define DEFINE_OCFS2_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__uint, name, \
+ TP_PROTO(unsigned int num), \
+ TP_ARGS(num))
+
+DECLARE_EVENT_CLASS(ocfs2__ull,
+ TP_PROTO(unsigned long long blkno),
+ TP_ARGS(blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%llu", __entry->blkno)
+);
+
+#define DEFINE_OCFS2_ULL_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull, name, \
+ TP_PROTO(unsigned long long num), \
+ TP_ARGS(num))
+
+DECLARE_EVENT_CLASS(ocfs2__pointer,
+ TP_PROTO(void *pointer),
+ TP_ARGS(pointer),
+ TP_STRUCT__entry(
+ __field(void *, pointer)
+ ),
+ TP_fast_assign(
+ __entry->pointer = pointer;
+ ),
+ TP_printk("%p", __entry->pointer)
+);
+
+#define DEFINE_OCFS2_POINTER_EVENT(name) \
+DEFINE_EVENT(ocfs2__pointer, name, \
+ TP_PROTO(void *pointer), \
+ TP_ARGS(pointer))
+
+DECLARE_EVENT_CLASS(ocfs2__string,
+ TP_PROTO(const char *name),
+ TP_ARGS(name),
+ TP_STRUCT__entry(
+ __string(name,name)
+ ),
+ TP_fast_assign(
+ __assign_str(name, name);
+ ),
+ TP_printk("%s", __get_str(name))
+);
+
+#define DEFINE_OCFS2_STRING_EVENT(name) \
+DEFINE_EVENT(ocfs2__string, name, \
+ TP_PROTO(const char *name), \
+ TP_ARGS(name))
+
+DECLARE_EVENT_CLASS(ocfs2__int_int,
+ TP_PROTO(int value1, int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(int, value1)
+ __field(int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%d %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_INT_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__int_int, name, \
+ TP_PROTO(int val1, int val2), \
+ TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__uint_int,
+ TP_PROTO(unsigned int value1, int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned int, value1)
+ __field(int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%u %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_UINT_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__uint_int, name, \
+ TP_PROTO(unsigned int val1, int val2), \
+ TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__uint_uint,
+ TP_PROTO(unsigned int value1, unsigned int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned int, value1)
+ __field(unsigned int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%u %u", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__uint_uint, name, \
+ TP_PROTO(unsigned int val1, unsigned int val2), \
+ TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint,
+ TP_PROTO(unsigned long long value1, unsigned int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%llu %u", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_uint, name, \
+ TP_PROTO(unsigned long long val1, unsigned int val2), \
+ TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_int,
+ TP_PROTO(unsigned long long value1, int value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(int, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%llu %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_int, name, \
+ TP_PROTO(unsigned long long val1, int val2), \
+ TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull,
+ TP_PROTO(unsigned long long value1, unsigned long long value2),
+ TP_ARGS(value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned long long, value2)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%llu %llu", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_ull, name, \
+ TP_PROTO(unsigned long long val1, unsigned long long val2), \
+ TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint,
+ TP_PROTO(unsigned long long value1,
+ unsigned long long value2, unsigned int value3),
+ TP_ARGS(value1, value2, value3),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned long long, value2)
+ __field(unsigned int, value3)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %llu %u",
+ __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_ull_uint, name, \
+ TP_PROTO(unsigned long long val1, \
+ unsigned long long val2, unsigned int val3), \
+ TP_ARGS(val1, val2, val3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint,
+ TP_PROTO(unsigned long long value1,
+ unsigned int value2, unsigned int value3),
+ TP_ARGS(value1, value2, value3),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned int, value2)
+ __field(unsigned int, value3)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %u %u", __entry->value1,
+ __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_uint_uint, name, \
+ TP_PROTO(unsigned long long val1, \
+ unsigned int val2, unsigned int val3), \
+ TP_ARGS(val1, val2, val3))
+
+DECLARE_EVENT_CLASS(ocfs2__uint_uint_uint,
+ TP_PROTO(unsigned int value1, unsigned int value2,
+ unsigned int value3),
+ TP_ARGS(value1, value2, value3),
+ TP_STRUCT__entry(
+ __field( unsigned int, value1 )
+ __field( unsigned int, value2 )
+ __field( unsigned int, value3 )
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%u %u %u", __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_UINT_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__uint_uint_uint, name, \
+ TP_PROTO(unsigned int value1, unsigned int value2, \
+ unsigned int value3), \
+ TP_ARGS(value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_ull,
+ TP_PROTO(unsigned long long value1,
+ unsigned long long value2, unsigned long long value3),
+ TP_ARGS(value1, value2, value3),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned long long, value2)
+ __field(unsigned long long, value3)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %llu %llu",
+ __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_ULL_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_ull_ull, name, \
+ TP_PROTO(unsigned long long value1, unsigned long long value2, \
+ unsigned long long value3), \
+ TP_ARGS(value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_int_int_int,
+ TP_PROTO(unsigned long long ull, int value1, int value2, int value3),
+ TP_ARGS(ull, value1, value2, value3),
+ TP_STRUCT__entry(
+ __field( unsigned long long, ull )
+ __field( int, value1 )
+ __field( int, value2 )
+ __field( int, value3 )
+ ),
+ TP_fast_assign(
+ __entry->ull = ull;
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %d %d %d",
+ __entry->ull, __entry->value1,
+ __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_int_int_int, name, \
+ TP_PROTO(unsigned long long ull, int value1, \
+ int value2, int value3), \
+ TP_ARGS(ull, value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint_uint,
+ TP_PROTO(unsigned long long ull, unsigned int value1,
+ unsigned int value2, unsigned int value3),
+ TP_ARGS(ull, value1, value2, value3),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ull)
+ __field(unsigned int, value1)
+ __field(unsigned int, value2)
+ __field(unsigned int, value3)
+ ),
+ TP_fast_assign(
+ __entry->ull = ull;
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ ),
+ TP_printk("%llu %u %u %u",
+ __entry->ull, __entry->value1,
+ __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_uint_uint_uint, name, \
+ TP_PROTO(unsigned long long ull, unsigned int value1, \
+ unsigned int value2, unsigned int value3), \
+ TP_ARGS(ull, value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint_uint,
+ TP_PROTO(unsigned long long value1, unsigned long long value2,
+ unsigned int value3, unsigned int value4),
+ TP_ARGS(value1, value2, value3, value4),
+ TP_STRUCT__entry(
+ __field(unsigned long long, value1)
+ __field(unsigned long long, value2)
+ __field(unsigned int, value3)
+ __field(unsigned int, value4)
+ ),
+ TP_fast_assign(
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ __entry->value3 = value3;
+ __entry->value4 = value4;
+ ),
+ TP_printk("%llu %llu %u %u",
+ __entry->value1, __entry->value2,
+ __entry->value3, __entry->value4)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(name) \
+DEFINE_EVENT(ocfs2__ull_ull_uint_uint, name, \
+ TP_PROTO(unsigned long long ull, unsigned long long ull1, \
+ unsigned int value2, unsigned int value3), \
+ TP_ARGS(ull, ull1, value2, value3))
+
+/* Trace events for fs/ocfs2/alloc.c. */
+DECLARE_EVENT_CLASS(ocfs2__btree_ops,
+ TP_PROTO(unsigned long long owner,\
+ unsigned int value1, unsigned int value2),
+ TP_ARGS(owner, value1, value2),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(unsigned int, value1)
+ __field(unsigned int, value2)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->value1 = value1;
+ __entry->value2 = value2;
+ ),
+ TP_printk("%llu %u %u",
+ __entry->owner, __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_BTREE_EVENT(name) \
+DEFINE_EVENT(ocfs2__btree_ops, name, \
+ TP_PROTO(unsigned long long owner, \
+ unsigned int value1, unsigned int value2), \
+ TP_ARGS(owner, value1, value2))
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_adjust_rightmost_branch);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_rotate_tree_right);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_append_rec_to_path);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_insert_extent_start);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_add_clusters_in_btree);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_num_free_extents);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_complete_edge_insert);
+
+TRACE_EVENT(ocfs2_grow_tree,
+ TP_PROTO(unsigned long long owner, int depth),
+ TP_ARGS(owner, depth),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(int, depth)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->depth = depth;
+ ),
+ TP_printk("%llu %d", __entry->owner, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_rotate_subtree,
+ TP_PROTO(int subtree_root, unsigned long long blkno,
+ int depth),
+ TP_ARGS(subtree_root, blkno, depth),
+ TP_STRUCT__entry(
+ __field(int, subtree_root)
+ __field(unsigned long long, blkno)
+ __field(int, depth)
+ ),
+ TP_fast_assign(
+ __entry->subtree_root = subtree_root;
+ __entry->blkno = blkno;
+ __entry->depth = depth;
+ ),
+ TP_printk("%d %llu %d", __entry->subtree_root,
+ __entry->blkno, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_insert_extent,
+ TP_PROTO(unsigned int ins_appending, unsigned int ins_contig,
+ int ins_contig_index, int free_records, int ins_tree_depth),
+ TP_ARGS(ins_appending, ins_contig, ins_contig_index, free_records,
+ ins_tree_depth),
+ TP_STRUCT__entry(
+ __field(unsigned int, ins_appending)
+ __field(unsigned int, ins_contig)
+ __field(int, ins_contig_index)
+ __field(int, free_records)
+ __field(int, ins_tree_depth)
+ ),
+ TP_fast_assign(
+ __entry->ins_appending = ins_appending;
+ __entry->ins_contig = ins_contig;
+ __entry->ins_contig_index = ins_contig_index;
+ __entry->free_records = free_records;
+ __entry->ins_tree_depth = ins_tree_depth;
+ ),
+ TP_printk("%u %u %d %d %d",
+ __entry->ins_appending, __entry->ins_contig,
+ __entry->ins_contig_index, __entry->free_records,
+ __entry->ins_tree_depth)
+);
+
+TRACE_EVENT(ocfs2_split_extent,
+ TP_PROTO(int split_index, unsigned int c_contig_type,
+ unsigned int c_has_empty_extent,
+ unsigned int c_split_covers_rec),
+ TP_ARGS(split_index, c_contig_type,
+ c_has_empty_extent, c_split_covers_rec),
+ TP_STRUCT__entry(
+ __field(int, split_index)
+ __field(unsigned int, c_contig_type)
+ __field(unsigned int, c_has_empty_extent)
+ __field(unsigned int, c_split_covers_rec)
+ ),
+ TP_fast_assign(
+ __entry->split_index = split_index;
+ __entry->c_contig_type = c_contig_type;
+ __entry->c_has_empty_extent = c_has_empty_extent;
+ __entry->c_split_covers_rec = c_split_covers_rec;
+ ),
+ TP_printk("%d %u %u %u", __entry->split_index, __entry->c_contig_type,
+ __entry->c_has_empty_extent, __entry->c_split_covers_rec)
+);
+
+TRACE_EVENT(ocfs2_remove_extent,
+ TP_PROTO(unsigned long long owner, unsigned int cpos,
+ unsigned int len, int index,
+ unsigned int e_cpos, unsigned int clusters),
+ TP_ARGS(owner, cpos, len, index, e_cpos, clusters),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(unsigned int, cpos)
+ __field(unsigned int, len)
+ __field(int, index)
+ __field(unsigned int, e_cpos)
+ __field(unsigned int, clusters)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->cpos = cpos;
+ __entry->len = len;
+ __entry->index = index;
+ __entry->e_cpos = e_cpos;
+ __entry->clusters = clusters;
+ ),
+ TP_printk("%llu %u %u %d %u %u",
+ __entry->owner, __entry->cpos, __entry->len, __entry->index,
+ __entry->e_cpos, __entry->clusters)
+);
+
+TRACE_EVENT(ocfs2_commit_truncate,
+ TP_PROTO(unsigned long long ino, unsigned int new_cpos,
+ unsigned int clusters, unsigned int depth),
+ TP_ARGS(ino, new_cpos, clusters, depth),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, new_cpos)
+ __field(unsigned int, clusters)
+ __field(unsigned int, depth)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->new_cpos = new_cpos;
+ __entry->clusters = clusters;
+ __entry->depth = depth;
+ ),
+ TP_printk("%llu %u %u %u",
+ __entry->ino, __entry->new_cpos,
+ __entry->clusters, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_validate_extent_block,
+ TP_PROTO(unsigned long long blkno),
+ TP_ARGS(blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%llu ", __entry->blkno)
+);
+
+TRACE_EVENT(ocfs2_rotate_leaf,
+ TP_PROTO(unsigned int insert_cpos, int insert_index,
+ int has_empty, int next_free,
+ unsigned int l_count),
+ TP_ARGS(insert_cpos, insert_index, has_empty,
+ next_free, l_count),
+ TP_STRUCT__entry(
+ __field(unsigned int, insert_cpos)
+ __field(int, insert_index)
+ __field(int, has_empty)
+ __field(int, next_free)
+ __field(unsigned int, l_count)
+ ),
+ TP_fast_assign(
+ __entry->insert_cpos = insert_cpos;
+ __entry->insert_index = insert_index;
+ __entry->has_empty = has_empty;
+ __entry->next_free = next_free;
+ __entry->l_count = l_count;
+ ),
+ TP_printk("%u %d %d %d %u", __entry->insert_cpos,
+ __entry->insert_index, __entry->has_empty,
+ __entry->next_free, __entry->l_count)
+);
+
+TRACE_EVENT(ocfs2_add_clusters_in_btree_ret,
+ TP_PROTO(int status, int reason, int err),
+ TP_ARGS(status, reason, err),
+ TP_STRUCT__entry(
+ __field(int, status)
+ __field(int, reason)
+ __field(int, err)
+ ),
+ TP_fast_assign(
+ __entry->status = status;
+ __entry->reason = reason;
+ __entry->err = err;
+ ),
+ TP_printk("%d %d %d", __entry->status,
+ __entry->reason, __entry->err)
+);
+
+TRACE_EVENT(ocfs2_mark_extent_written,
+ TP_PROTO(unsigned long long owner, unsigned int cpos,
+ unsigned int len, unsigned int phys),
+ TP_ARGS(owner, cpos, len, phys),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(unsigned int, cpos)
+ __field(unsigned int, len)
+ __field(unsigned int, phys)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->cpos = cpos;
+ __entry->len = len;
+ __entry->phys = phys;
+ ),
+ TP_printk("%llu %u %u %u",
+ __entry->owner, __entry->cpos,
+ __entry->len, __entry->phys)
+);
+
+DECLARE_EVENT_CLASS(ocfs2__truncate_log_ops,
+ TP_PROTO(unsigned long long blkno, int index,
+ unsigned int start, unsigned int num),
+ TP_ARGS(blkno, index, start, num),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ __field(int, index)
+ __field(unsigned int, start)
+ __field(unsigned int, num)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ __entry->index = index;
+ __entry->start = start;
+ __entry->num = num;
+ ),
+ TP_printk("%llu %d %u %u",
+ __entry->blkno, __entry->index,
+ __entry->start, __entry->num)
+);
+
+#define DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(name) \
+DEFINE_EVENT(ocfs2__truncate_log_ops, name, \
+ TP_PROTO(unsigned long long blkno, int index, \
+ unsigned int start, unsigned int num), \
+ TP_ARGS(blkno, index, start, num))
+
+DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_truncate_log_append);
+
+DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_replay_truncate_records);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_flush_truncate_log);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_begin_truncate_log_recovery);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_truncate_log_recovery_num);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_complete_truncate_log_recovery);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_free_cached_blocks);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_cache_cluster_dealloc);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_run_deallocs);
+
+TRACE_EVENT(ocfs2_cache_block_dealloc,
+ TP_PROTO(int type, int slot, unsigned long long suballoc,
+ unsigned long long blkno, unsigned int bit),
+ TP_ARGS(type, slot, suballoc, blkno, bit),
+ TP_STRUCT__entry(
+ __field(int, type)
+ __field(int, slot)
+ __field(unsigned long long, suballoc)
+ __field(unsigned long long, blkno)
+ __field(unsigned int, bit)
+ ),
+ TP_fast_assign(
+ __entry->type = type;
+ __entry->slot = slot;
+ __entry->suballoc = suballoc;
+ __entry->blkno = blkno;
+ __entry->bit = bit;
+ ),
+ TP_printk("%d %d %llu %llu %u",
+ __entry->type, __entry->slot, __entry->suballoc,
+ __entry->blkno, __entry->bit)
+);
+
+TRACE_EVENT(ocfs2_trim_extent,
+ TP_PROTO(struct super_block *sb, unsigned long long blk,
+ unsigned long long count),
+ TP_ARGS(sb, blk, count),
+ TP_STRUCT__entry(
+ __field(int, dev_major)
+ __field(int, dev_minor)
+ __field(unsigned long long, blk)
+ __field(__u64, count)
+ ),
+ TP_fast_assign(
+ __entry->dev_major = MAJOR(sb->s_dev);
+ __entry->dev_minor = MINOR(sb->s_dev);
+ __entry->blk = blk;
+ __entry->count = count;
+ ),
+ TP_printk("%d %d %llu %llu",
+ __entry->dev_major, __entry->dev_minor,
+ __entry->blk, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
+
+/* End of trace events for fs/ocfs2/alloc.c. */
+
+/* Trace events for fs/ocfs2/localalloc.c. */
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_la_set_sizes);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_alloc_should_use_local);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_load_local_alloc);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_begin_local_alloc_recovery);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_reserve_local_alloc_bits);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_local_alloc_count_bits);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits_search_bitmap);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_sync_local_to_main);
+
+TRACE_EVENT(ocfs2_sync_local_to_main_free,
+ TP_PROTO(int count, int bit, unsigned long long start_blk,
+ unsigned long long blkno),
+ TP_ARGS(count, bit, start_blk, blkno),
+ TP_STRUCT__entry(
+ __field(int, count)
+ __field(int, bit)
+ __field(unsigned long long, start_blk)
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->count = count;
+ __entry->bit = bit;
+ __entry->start_blk = start_blk;
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%d %d %llu %llu",
+ __entry->count, __entry->bit, __entry->start_blk,
+ __entry->blkno)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_new_window);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result);
+
+/* End of trace events for fs/ocfs2/localalloc.c. */
+
+/* Trace events for fs/ocfs2/resize.c. */
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_update_last_group_and_inode);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_group_extend);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_group_add);
+
+/* End of trace events for fs/ocfs2/resize.c. */
+
+/* Trace events for fs/ocfs2/suballoc.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_group_descriptor);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_contig);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_discontig);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_block_group_alloc);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_nospc);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_no_new_group);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_reserve_new_inode_new_group);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_set_bits);
+
+TRACE_EVENT(ocfs2_relink_block_group,
+ TP_PROTO(unsigned long long i_blkno, unsigned int chain,
+ unsigned long long bg_blkno,
+ unsigned long long prev_blkno),
+ TP_ARGS(i_blkno, chain, bg_blkno, prev_blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, i_blkno)
+ __field(unsigned int, chain)
+ __field(unsigned long long, bg_blkno)
+ __field(unsigned long long, prev_blkno)
+ ),
+ TP_fast_assign(
+ __entry->i_blkno = i_blkno;
+ __entry->chain = chain;
+ __entry->bg_blkno = bg_blkno;
+ __entry->prev_blkno = prev_blkno;
+ ),
+ TP_printk("%llu %u %llu %llu",
+ __entry->i_blkno, __entry->chain, __entry->bg_blkno,
+ __entry->prev_blkno)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_cluster_group_search_wrong_max_bits);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cluster_group_search_max_block);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_block_group_search_max_block);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_search_chain_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_succ);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_end);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_claim_suballoc_bits);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_claim_new_inode_at_loc);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_clear_bits);
+
+TRACE_EVENT(ocfs2_free_suballoc_bits,
+ TP_PROTO(unsigned long long inode, unsigned long long group,
+ unsigned int start_bit, unsigned int count),
+ TP_ARGS(inode, group, start_bit, count),
+ TP_STRUCT__entry(
+ __field(unsigned long long, inode)
+ __field(unsigned long long, group)
+ __field(unsigned int, start_bit)
+ __field(unsigned int, count)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->group = group;
+ __entry->start_bit = start_bit;
+ __entry->count = count;
+ ),
+ TP_printk("%llu %llu %u %u", __entry->inode, __entry->group,
+ __entry->start_bit, __entry->count)
+);
+
+TRACE_EVENT(ocfs2_free_clusters,
+ TP_PROTO(unsigned long long bg_blkno, unsigned long long start_blk,
+ unsigned int start_bit, unsigned int count),
+ TP_ARGS(bg_blkno, start_blk, start_bit, count),
+ TP_STRUCT__entry(
+ __field(unsigned long long, bg_blkno)
+ __field(unsigned long long, start_blk)
+ __field(unsigned int, start_bit)
+ __field(unsigned int, count)
+ ),
+ TP_fast_assign(
+ __entry->bg_blkno = bg_blkno;
+ __entry->start_blk = start_blk;
+ __entry->start_bit = start_bit;
+ __entry->count = count;
+ ),
+ TP_printk("%llu %llu %u %u", __entry->bg_blkno, __entry->start_blk,
+ __entry->start_bit, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_get_suballoc_slot_bit);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_test_suballoc_bit);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_test_inode_bit);
+
+/* End of trace events for fs/ocfs2/suballoc.c. */
+
+/* Trace events for fs/ocfs2/refcounttree.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_refcount_block);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_refcount_trees);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree_blkno);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_change_refcount_rec);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_expand_inline_ref_root);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_divide_leaf_refcount_block);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_new_leaf_refcount_block);
+
+DECLARE_EVENT_CLASS(ocfs2__refcount_tree_ops,
+ TP_PROTO(unsigned long long blkno, int index,
+ unsigned long long cpos,
+ unsigned int clusters, unsigned int refcount),
+ TP_ARGS(blkno, index, cpos, clusters, refcount),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ __field(int, index)
+ __field(unsigned long long, cpos)
+ __field(unsigned int, clusters)
+ __field(unsigned int, refcount)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ __entry->index = index;
+ __entry->cpos = cpos;
+ __entry->clusters = clusters;
+ __entry->refcount = refcount;
+ ),
+ TP_printk("%llu %d %llu %u %u", __entry->blkno, __entry->index,
+ __entry->cpos, __entry->clusters, __entry->refcount)
+);
+
+#define DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(name) \
+DEFINE_EVENT(ocfs2__refcount_tree_ops, name, \
+ TP_PROTO(unsigned long long blkno, int index, \
+ unsigned long long cpos, \
+ unsigned int count, unsigned int refcount), \
+ TP_ARGS(blkno, index, cpos, count, refcount))
+
+DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_insert_refcount_rec);
+
+TRACE_EVENT(ocfs2_split_refcount_rec,
+ TP_PROTO(unsigned long long cpos,
+ unsigned int clusters, unsigned int refcount,
+ unsigned long long split_cpos,
+ unsigned int split_clusters, unsigned int split_refcount),
+ TP_ARGS(cpos, clusters, refcount,
+ split_cpos, split_clusters, split_refcount),
+ TP_STRUCT__entry(
+ __field(unsigned long long, cpos)
+ __field(unsigned int, clusters)
+ __field(unsigned int, refcount)
+ __field(unsigned long long, split_cpos)
+ __field(unsigned int, split_clusters)
+ __field(unsigned int, split_refcount)
+ ),
+ TP_fast_assign(
+ __entry->cpos = cpos;
+ __entry->clusters = clusters;
+ __entry->refcount = refcount;
+ __entry->split_cpos = split_cpos;
+ __entry->split_clusters = split_clusters;
+ __entry->split_refcount = split_refcount;
+ ),
+ TP_printk("%llu %u %u %llu %u %u",
+ __entry->cpos, __entry->clusters, __entry->refcount,
+ __entry->split_cpos, __entry->split_clusters,
+ __entry->split_refcount)
+);
+
+DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_split_refcount_rec_insert);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_increase_refcount_begin);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_change);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_increase_refcount_insert);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_split);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_remove_refcount_extent);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_restore_refcount_block);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_decrease_refcount_rec);
+
+TRACE_EVENT(ocfs2_decrease_refcount,
+ TP_PROTO(unsigned long long owner,
+ unsigned long long cpos,
+ unsigned int len, int delete),
+ TP_ARGS(owner, cpos, len, delete),
+ TP_STRUCT__entry(
+ __field(unsigned long long, owner)
+ __field(unsigned long long, cpos)
+ __field(unsigned int, len)
+ __field(int, delete)
+ ),
+ TP_fast_assign(
+ __entry->owner = owner;
+ __entry->cpos = cpos;
+ __entry->len = len;
+ __entry->delete = delete;
+ ),
+ TP_printk("%llu %llu %u %d",
+ __entry->owner, __entry->cpos, __entry->len, __entry->delete)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_mark_extent_refcounted);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_calc_refcount_meta_credits);
+
+TRACE_EVENT(ocfs2_calc_refcount_meta_credits_iterate,
+ TP_PROTO(int recs_add, unsigned long long cpos,
+ unsigned int clusters, unsigned long long r_cpos,
+ unsigned int r_clusters, unsigned int refcount, int index),
+ TP_ARGS(recs_add, cpos, clusters, r_cpos, r_clusters, refcount, index),
+ TP_STRUCT__entry(
+ __field(int, recs_add)
+ __field(unsigned long long, cpos)
+ __field(unsigned int, clusters)
+ __field(unsigned long long, r_cpos)
+ __field(unsigned int, r_clusters)
+ __field(unsigned int, refcount)
+ __field(int, index)
+ ),
+ TP_fast_assign(
+ __entry->recs_add = recs_add;
+ __entry->cpos = cpos;
+ __entry->clusters = clusters;
+ __entry->r_cpos = r_cpos;
+ __entry->r_clusters = r_clusters;
+ __entry->refcount = refcount;
+ __entry->index = index;
+ ),
+ TP_printk("%d %llu %u %llu %u %u %d",
+ __entry->recs_add, __entry->cpos, __entry->clusters,
+ __entry->r_cpos, __entry->r_clusters,
+ __entry->refcount, __entry->index)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_add_refcount_flag);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_prepare_refcount_change_for_del);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_lock_refcount_allocators);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_page);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_jbd);
+
+TRACE_EVENT(ocfs2_clear_ext_refcount,
+ TP_PROTO(unsigned long long ino, unsigned int cpos,
+ unsigned int len, unsigned int p_cluster,
+ unsigned int ext_flags),
+ TP_ARGS(ino, cpos, len, p_cluster, ext_flags),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, cpos)
+ __field(unsigned int, len)
+ __field(unsigned int, p_cluster)
+ __field(unsigned int, ext_flags)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->cpos = cpos;
+ __entry->len = len;
+ __entry->p_cluster = p_cluster;
+ __entry->ext_flags = ext_flags;
+ ),
+ TP_printk("%llu %u %u %u %u",
+ __entry->ino, __entry->cpos, __entry->len,
+ __entry->p_cluster, __entry->ext_flags)
+);
+
+TRACE_EVENT(ocfs2_replace_clusters,
+ TP_PROTO(unsigned long long ino, unsigned int cpos,
+ unsigned int old, unsigned int new, unsigned int len,
+ unsigned int ext_flags),
+ TP_ARGS(ino, cpos, old, new, len, ext_flags),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, cpos)
+ __field(unsigned int, old)
+ __field(unsigned int, new)
+ __field(unsigned int, len)
+ __field(unsigned int, ext_flags)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->cpos = cpos;
+ __entry->old = old;
+ __entry->new = new;
+ __entry->len = len;
+ __entry->ext_flags = ext_flags;
+ ),
+ TP_printk("%llu %u %u %u %u %u",
+ __entry->ino, __entry->cpos, __entry->old, __entry->new,
+ __entry->len, __entry->ext_flags)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_make_clusters_writable);
+
+TRACE_EVENT(ocfs2_refcount_cow_hunk,
+ TP_PROTO(unsigned long long ino, unsigned int cpos,
+ unsigned int write_len, unsigned int max_cpos,
+ unsigned int cow_start, unsigned int cow_len),
+ TP_ARGS(ino, cpos, write_len, max_cpos, cow_start, cow_len),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, cpos)
+ __field(unsigned int, write_len)
+ __field(unsigned int, max_cpos)
+ __field(unsigned int, cow_start)
+ __field(unsigned int, cow_len)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->cpos = cpos;
+ __entry->write_len = write_len;
+ __entry->max_cpos = max_cpos;
+ __entry->cow_start = cow_start;
+ __entry->cow_len = cow_len;
+ ),
+ TP_printk("%llu %u %u %u %u %u",
+ __entry->ino, __entry->cpos, __entry->write_len,
+ __entry->max_cpos, __entry->cow_start, __entry->cow_len)
+);
+
+/* End of trace events for fs/ocfs2/refcounttree.c. */
+
+/* Trace events for fs/ocfs2/aops.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__get_block,
+ TP_PROTO(unsigned long long ino, unsigned long long iblock,
+ void *bh_result, int create),
+ TP_ARGS(ino, iblock, bh_result, create),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned long long, iblock)
+ __field(void *, bh_result)
+ __field(int, create)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->iblock = iblock;
+ __entry->bh_result = bh_result;
+ __entry->create = create;
+ ),
+ TP_printk("%llu %llu %p %d",
+ __entry->ino, __entry->iblock,
+ __entry->bh_result, __entry->create)
+);
+
+#define DEFINE_OCFS2_GET_BLOCK_EVENT(name) \
+DEFINE_EVENT(ocfs2__get_block, name, \
+ TP_PROTO(unsigned long long ino, unsigned long long iblock, \
+ void *bh_result, int create), \
+ TP_ARGS(ino, iblock, bh_result, create))
+
+DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_symlink_get_block);
+
+DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_get_block);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap);
+
+TRACE_EVENT(ocfs2_try_to_write_inline_data,
+ TP_PROTO(unsigned long long ino, unsigned int len,
+ unsigned long long pos, unsigned int flags),
+ TP_ARGS(ino, len, pos, flags),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, len)
+ __field(unsigned long long, pos)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->len = len;
+ __entry->pos = pos;
+ __entry->flags = flags;
+ ),
+ TP_printk("%llu %u %llu 0x%x",
+ __entry->ino, __entry->len, __entry->pos, __entry->flags)
+);
+
+TRACE_EVENT(ocfs2_write_begin_nolock,
+ TP_PROTO(unsigned long long ino,
+ long long i_size, unsigned int i_clusters,
+ unsigned long long pos, unsigned int len,
+ unsigned int flags, void *page,
+ unsigned int clusters, unsigned int extents_to_split),
+ TP_ARGS(ino, i_size, i_clusters, pos, len, flags,
+ page, clusters, extents_to_split),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(long long, i_size)
+ __field(unsigned int, i_clusters)
+ __field(unsigned long long, pos)
+ __field(unsigned int, len)
+ __field(unsigned int, flags)
+ __field(void *, page)
+ __field(unsigned int, clusters)
+ __field(unsigned int, extents_to_split)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->i_size = i_size;
+ __entry->i_clusters = i_clusters;
+ __entry->pos = pos;
+ __entry->len = len;
+ __entry->flags = flags;
+ __entry->page = page;
+ __entry->clusters = clusters;
+ __entry->extents_to_split = extents_to_split;
+ ),
+ TP_printk("%llu %lld %u %llu %u %u %p %u %u",
+ __entry->ino, __entry->i_size, __entry->i_clusters,
+ __entry->pos, __entry->len,
+ __entry->flags, __entry->page, __entry->clusters,
+ __entry->extents_to_split)
+);
+
+TRACE_EVENT(ocfs2_write_end_inline,
+ TP_PROTO(unsigned long long ino,
+ unsigned long long pos, unsigned int copied,
+ unsigned int id_count, unsigned int features),
+ TP_ARGS(ino, pos, copied, id_count, features),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned long long, pos)
+ __field(unsigned int, copied)
+ __field(unsigned int, id_count)
+ __field(unsigned int, features)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->pos = pos;
+ __entry->copied = copied;
+ __entry->id_count = id_count;
+ __entry->features = features;
+ ),
+ TP_printk("%llu %llu %u %u %u",
+ __entry->ino, __entry->pos, __entry->copied,
+ __entry->id_count, __entry->features)
+);
+
+/* End of trace events for fs/ocfs2/aops.c. */
+
+/* Trace events for fs/ocfs2/mmap.c. */
+
+TRACE_EVENT(ocfs2_fault,
+ TP_PROTO(unsigned long long ino,
+ void *area, void *page, unsigned long pgoff),
+ TP_ARGS(ino, area, page, pgoff),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(void *, area)
+ __field(void *, page)
+ __field(unsigned long, pgoff)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->area = area;
+ __entry->page = page;
+ __entry->pgoff = pgoff;
+ ),
+ TP_printk("%llu %p %p %lu",
+ __entry->ino, __entry->area, __entry->page, __entry->pgoff)
+);
+
+/* End of trace events for fs/ocfs2/mmap.c. */
+
+/* Trace events for fs/ocfs2/file.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__file_ops,
+ TP_PROTO(void *inode, void *file, void *dentry,
+ unsigned long long ino,
+ unsigned int d_len, const unsigned char *d_name,
+ unsigned long long para),
+ TP_ARGS(inode, file, dentry, ino, d_len, d_name, para),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(void *, file)
+ __field(void *, dentry)
+ __field(unsigned long long, ino)
+ __field(unsigned int, d_len)
+ __string(d_name, d_name)
+ __field(unsigned long long, para)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->file = file;
+ __entry->dentry = dentry;
+ __entry->ino = ino;
+ __entry->d_len = d_len;
+ __assign_str(d_name, d_name);
+ __entry->para = para;
+ ),
+ TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file,
+ __entry->dentry, __entry->ino, __entry->para,
+ __entry->d_len, __get_str(d_name))
+);
+
+#define DEFINE_OCFS2_FILE_OPS(name) \
+DEFINE_EVENT(ocfs2__file_ops, name, \
+TP_PROTO(void *inode, void *file, void *dentry, \
+ unsigned long long ino, \
+ unsigned int d_len, const unsigned char *d_name, \
+ unsigned long long mode), \
+ TP_ARGS(inode, file, dentry, ino, d_len, d_name, mode))
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_open);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_release);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error);
+
+TRACE_EVENT(ocfs2_extend_allocation,
+ TP_PROTO(unsigned long long ip_blkno, unsigned long long size,
+ unsigned int clusters, unsigned int clusters_to_add,
+ int why, int restart_func),
+ TP_ARGS(ip_blkno, size, clusters, clusters_to_add, why, restart_func),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ip_blkno)
+ __field(unsigned long long, size)
+ __field(unsigned int, clusters)
+ __field(unsigned int, clusters_to_add)
+ __field(int, why)
+ __field(int, restart_func)
+ ),
+ TP_fast_assign(
+ __entry->ip_blkno = ip_blkno;
+ __entry->size = size;
+ __entry->clusters = clusters;
+ __entry->clusters_to_add = clusters_to_add;
+ __entry->why = why;
+ __entry->restart_func = restart_func;
+ ),
+ TP_printk("%llu %llu %u %u %d %d",
+ __entry->ip_blkno, __entry->size, __entry->clusters,
+ __entry->clusters_to_add, __entry->why, __entry->restart_func)
+);
+
+TRACE_EVENT(ocfs2_extend_allocation_end,
+ TP_PROTO(unsigned long long ino,
+ unsigned int di_clusters, unsigned long long di_size,
+ unsigned int ip_clusters, unsigned long long i_size),
+ TP_ARGS(ino, di_clusters, di_size, ip_clusters, i_size),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, di_clusters)
+ __field(unsigned long long, di_size)
+ __field(unsigned int, ip_clusters)
+ __field(unsigned long long, i_size)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->di_clusters = di_clusters;
+ __entry->di_size = di_size;
+ __entry->ip_clusters = ip_clusters;
+ __entry->i_size = i_size;
+ ),
+ TP_printk("%llu %u %llu %u %llu", __entry->ino, __entry->di_clusters,
+ __entry->di_size, __entry->ip_clusters, __entry->i_size)
+);
+
+TRACE_EVENT(ocfs2_write_zero_page,
+ TP_PROTO(unsigned long long ino,
+ unsigned long long abs_from, unsigned long long abs_to,
+ unsigned long index, unsigned int zero_from,
+ unsigned int zero_to),
+ TP_ARGS(ino, abs_from, abs_to, index, zero_from, zero_to),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned long long, abs_from)
+ __field(unsigned long long, abs_to)
+ __field(unsigned long, index)
+ __field(unsigned int, zero_from)
+ __field(unsigned int, zero_to)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->abs_from = abs_from;
+ __entry->abs_to = abs_to;
+ __entry->index = index;
+ __entry->zero_from = zero_from;
+ __entry->zero_to = zero_to;
+ ),
+ TP_printk("%llu %llu %llu %lu %u %u", __entry->ino,
+ __entry->abs_from, __entry->abs_to,
+ __entry->index, __entry->zero_from, __entry->zero_to)
+);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend_range);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend);
+
+TRACE_EVENT(ocfs2_setattr,
+ TP_PROTO(void *inode, void *dentry,
+ unsigned long long ino,
+ unsigned int d_len, const unsigned char *d_name,
+ unsigned int ia_valid, unsigned int ia_mode,
+ unsigned int ia_uid, unsigned int ia_gid),
+ TP_ARGS(inode, dentry, ino, d_len, d_name,
+ ia_valid, ia_mode, ia_uid, ia_gid),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(void *, dentry)
+ __field(unsigned long long, ino)
+ __field(unsigned int, d_len)
+ __string(d_name, d_name)
+ __field(unsigned int, ia_valid)
+ __field(unsigned int, ia_mode)
+ __field(unsigned int, ia_uid)
+ __field(unsigned int, ia_gid)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->dentry = dentry;
+ __entry->ino = ino;
+ __entry->d_len = d_len;
+ __assign_str(d_name, d_name);
+ __entry->ia_valid = ia_valid;
+ __entry->ia_mode = ia_mode;
+ __entry->ia_uid = ia_uid;
+ __entry->ia_gid = ia_gid;
+ ),
+ TP_printk("%p %p %llu %.*s %u %u %u %u", __entry->inode,
+ __entry->dentry, __entry->ino, __entry->d_len,
+ __get_str(d_name), __entry->ia_valid, __entry->ia_mode,
+ __entry->ia_uid, __entry->ia_gid)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_write_remove_suid);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_partial_clusters);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range1);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range2);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
+
+TRACE_EVENT(ocfs2_prepare_inode_for_write,
+ TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
+ int appending, unsigned long count,
+ int *direct_io, int *has_refcount),
+ TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned long long, saved_pos)
+ __field(int, appending)
+ __field(unsigned long, count)
+ __field(int, direct_io)
+ __field(int, has_refcount)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->saved_pos = saved_pos;
+ __entry->appending = appending;
+ __entry->count = count;
+ __entry->direct_io = direct_io ? *direct_io : -1;
+ __entry->has_refcount = has_refcount ? *has_refcount : -1;
+ ),
+ TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
+ __entry->saved_pos, __entry->appending, __entry->count,
+ __entry->direct_io, __entry->has_refcount)
+);
+
+DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
+
+/* End of trace events for fs/ocfs2/file.c. */
+
+/* Trace events for fs/ocfs2/inode.c. */
+
+TRACE_EVENT(ocfs2_iget_begin,
+ TP_PROTO(unsigned long long ino, unsigned int flags, int sysfile_type),
+ TP_ARGS(ino, flags, sysfile_type),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(unsigned int, flags)
+ __field(int, sysfile_type)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->flags = flags;
+ __entry->sysfile_type = sysfile_type;
+ ),
+ TP_printk("%llu %u %d", __entry->ino,
+ __entry->flags, __entry->sysfile_type)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_iget5_locked);
+
+TRACE_EVENT(ocfs2_iget_end,
+ TP_PROTO(void *inode, unsigned long long ino),
+ TP_ARGS(inode, ino),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(unsigned long long, ino)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->ino = ino;
+ ),
+ TP_printk("%p %llu", __entry->inode, __entry->ino)
+);
+
+TRACE_EVENT(ocfs2_find_actor,
+ TP_PROTO(void *inode, unsigned long long ino,
+ void *args, unsigned long long fi_blkno),
+ TP_ARGS(inode, ino, args, fi_blkno),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(unsigned long long, ino)
+ __field(void *, args)
+ __field(unsigned long long, fi_blkno)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->ino = ino;
+ __entry->args = args;
+ __entry->fi_blkno = fi_blkno;
+ ),
+ TP_printk("%p %llu %p %llu", __entry->inode, __entry->ino,
+ __entry->args, __entry->fi_blkno)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_populate_inode);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
+
+TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
+ TP_PROTO(void *task, void *dc_task, unsigned long long ino,
+ unsigned int flags),
+ TP_ARGS(task, dc_task, ino, flags),
+ TP_STRUCT__entry(
+ __field(void *, task)
+ __field(void *, dc_task)
+ __field(unsigned long long, ino)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->task = task;
+ __entry->dc_task = dc_task;
+ __entry->ino = ino;
+ __entry->flags = flags;
+ ),
+ TP_printk("%p %p %llu %u", __entry->task, __entry->dc_task,
+ __entry->ino, __entry->flags)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_query_inode_wipe_begin);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_query_inode_wipe_succ);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_query_inode_wipe_end);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_cleanup_delete_inode);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode);
+
+TRACE_EVENT(ocfs2_inode_revalidate,
+ TP_PROTO(void *inode, unsigned long long ino,
+ unsigned int flags),
+ TP_ARGS(inode, ino, flags),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(unsigned long long, ino)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->ino = ino;
+ __entry->flags = flags;
+ ),
+ TP_printk("%p %llu %u", __entry->inode, __entry->ino, __entry->flags)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_mark_inode_dirty);
+
+/* End of trace events for fs/ocfs2/inode.c. */
+
+/* Trace events for fs/ocfs2/extent_map.c. */
+
+TRACE_EVENT(ocfs2_read_virt_blocks,
+ TP_PROTO(void *inode, unsigned long long vblock, int nr,
+ void *bhs, unsigned int flags, void *validate),
+ TP_ARGS(inode, vblock, nr, bhs, flags, validate),
+ TP_STRUCT__entry(
+ __field(void *, inode)
+ __field(unsigned long long, vblock)
+ __field(int, nr)
+ __field(void *, bhs)
+ __field(unsigned int, flags)
+ __field(void *, validate)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->vblock = vblock;
+ __entry->nr = nr;
+ __entry->bhs = bhs;
+ __entry->flags = flags;
+ __entry->validate = validate;
+ ),
+ TP_printk("%p %llu %d %p %x %p", __entry->inode, __entry->vblock,
+ __entry->nr, __entry->bhs, __entry->flags, __entry->validate)
+);
+
+/* End of trace events for fs/ocfs2/extent_map.c. */
+
+/* Trace events for fs/ocfs2/slot_map.c. */
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_refresh_slot_info);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers_block);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_find_slot);
+
+/* End of trace events for fs/ocfs2/slot_map.c. */
+
+/* Trace events for fs/ocfs2/heartbeat.c. */
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down);
+
+/* End of trace events for fs/ocfs2/heartbeat.c. */
+
+/* Trace events for fs/ocfs2/super.c. */
+
+TRACE_EVENT(ocfs2_remount,
+ TP_PROTO(unsigned long s_flags, unsigned long osb_flags, int flags),
+ TP_ARGS(s_flags, osb_flags, flags),
+ TP_STRUCT__entry(
+ __field(unsigned long, s_flags)
+ __field(unsigned long, osb_flags)
+ __field(int, flags)
+ ),
+ TP_fast_assign(
+ __entry->s_flags = s_flags;
+ __entry->osb_flags = osb_flags;
+ __entry->flags = flags;
+ ),
+ TP_printk("%lu %lu %d", __entry->s_flags,
+ __entry->osb_flags, __entry->flags)
+);
+
+TRACE_EVENT(ocfs2_fill_super,
+ TP_PROTO(void *sb, void *data, int silent),
+ TP_ARGS(sb, data, silent),
+ TP_STRUCT__entry(
+ __field(void *, sb)
+ __field(void *, data)
+ __field(int, silent)
+ ),
+ TP_fast_assign(
+ __entry->sb = sb;
+ __entry->data = data;
+ __entry->silent = silent;
+ ),
+ TP_printk("%p %p %d", __entry->sb,
+ __entry->data, __entry->silent)
+);
+
+TRACE_EVENT(ocfs2_parse_options,
+ TP_PROTO(int is_remount, char *options),
+ TP_ARGS(is_remount, options),
+ TP_STRUCT__entry(
+ __field(int, is_remount)
+ __string(options, options)
+ ),
+ TP_fast_assign(
+ __entry->is_remount = is_remount;
+ __assign_str(options, options);
+ ),
+ TP_printk("%d %s", __entry->is_remount, __get_str(options))
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super);
+
+TRACE_EVENT(ocfs2_statfs,
+ TP_PROTO(void *sb, void *buf),
+ TP_ARGS(sb, buf),
+ TP_STRUCT__entry(
+ __field(void *, sb)
+ __field(void *, buf)
+ ),
+ TP_fast_assign(
+ __entry->sb = sb;
+ __entry->buf = buf;
+ ),
+ TP_printk("%p %p", __entry->sb, __entry->buf)
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_dismount_volume);
+
+TRACE_EVENT(ocfs2_initialize_super,
+ TP_PROTO(char *label, char *uuid_str, unsigned long long root_dir,
+ unsigned long long system_dir, int cluster_bits),
+ TP_ARGS(label, uuid_str, root_dir, system_dir, cluster_bits),
+ TP_STRUCT__entry(
+ __string(label, label)
+ __string(uuid_str, uuid_str)
+ __field(unsigned long long, root_dir)
+ __field(unsigned long long, system_dir)
+ __field(int, cluster_bits)
+ ),
+ TP_fast_assign(
+ __assign_str(label, label);
+ __assign_str(uuid_str, uuid_str);
+ __entry->root_dir = root_dir;
+ __entry->system_dir = system_dir;
+ __entry->cluster_bits = cluster_bits;
+ ),
+ TP_printk("%s %s %llu %llu %d", __get_str(label), __get_str(uuid_str),
+ __entry->root_dir, __entry->system_dir, __entry->cluster_bits)
+);
+
+/* End of trace events for fs/ocfs2/super.c. */
+
+/* Trace events for fs/ocfs2/xattr.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_xattr_block);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_xattr_extend_allocation);
+
+TRACE_EVENT(ocfs2_init_xattr_set_ctxt,
+ TP_PROTO(const char *name, int meta, int clusters, int credits),
+ TP_ARGS(name, meta, clusters, credits),
+ TP_STRUCT__entry(
+ __string(name, name)
+ __field(int, meta)
+ __field(int, clusters)
+ __field(int, credits)
+ ),
+ TP_fast_assign(
+ __assign_str(name, name);
+ __entry->meta = meta;
+ __entry->clusters = clusters;
+ __entry->credits = credits;
+ ),
+ TP_printk("%s %d %d %d", __get_str(name), __entry->meta,
+ __entry->clusters, __entry->credits)
+);
+
+DECLARE_EVENT_CLASS(ocfs2__xattr_find,
+ TP_PROTO(unsigned long long ino, const char *name, int name_index,
+ unsigned int hash, unsigned long long location,
+ int xe_index),
+ TP_ARGS(ino, name, name_index, hash, location, xe_index),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __string(name, name)
+ __field(int, name_index)
+ __field(unsigned int, hash)
+ __field(unsigned long long, location)
+ __field(int, xe_index)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __assign_str(name, name);
+ __entry->name_index = name_index;
+ __entry->hash = hash;
+ __entry->location = location;
+ __entry->xe_index = xe_index;
+ ),
+ TP_printk("%llu %s %d %u %llu %d", __entry->ino, __get_str(name),
+ __entry->name_index, __entry->hash, __entry->location,
+ __entry->xe_index)
+);
+
+#define DEFINE_OCFS2_XATTR_FIND_EVENT(name) \
+DEFINE_EVENT(ocfs2__xattr_find, name, \
+TP_PROTO(unsigned long long ino, const char *name, int name_index, \
+ unsigned int hash, unsigned long long bucket, \
+ int xe_index), \
+ TP_ARGS(ino, name, name_index, hash, bucket, xe_index))
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_bucket_find);
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find);
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find_rec);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_iterate_xattr_buckets);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_iterate_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cp_xattr_block_to_bucket_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cp_xattr_block_to_bucket_end);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block_begin);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_defrag_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_bucket_cross_cluster);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_divide_xattr_bucket_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_divide_xattr_bucket_move);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_cp_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_buckets);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_adjust_xattr_cross_cluster);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_add_new_xattr_cluster);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_insert);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_extend_xattr_bucket);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_add_new_xattr_bucket);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_xattr_bucket_value_truncate);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_rm_xattr_cluster);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_header);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_create_empty_xattr_block);
+
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_bucket);
+
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_index_block);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_xattr_bucket_value_refcount);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_reflink_xattr_buckets);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_rec);
+
+/* End of trace events for fs/ocfs2/xattr.c. */
+
+/* Trace events for fs/ocfs2/reservations.c. */
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_insert);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_begin);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_end);
+
+TRACE_EVENT(ocfs2_resv_find_window_begin,
+ TP_PROTO(unsigned int r_start, unsigned int r_end, unsigned int goal,
+ unsigned int wanted, int empty_root),
+ TP_ARGS(r_start, r_end, goal, wanted, empty_root),
+ TP_STRUCT__entry(
+ __field(unsigned int, r_start)
+ __field(unsigned int, r_end)
+ __field(unsigned int, goal)
+ __field(unsigned int, wanted)
+ __field(int, empty_root)
+ ),
+ TP_fast_assign(
+ __entry->r_start = r_start;
+ __entry->r_end = r_end;
+ __entry->goal = goal;
+ __entry->wanted = wanted;
+ __entry->empty_root = empty_root;
+ ),
+ TP_printk("%u %u %u %u %d", __entry->r_start, __entry->r_end,
+ __entry->goal, __entry->wanted, __entry->empty_root)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_find_window_prev);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_resv_find_window_next);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cannibalize_resv_begin);
+
+TRACE_EVENT(ocfs2_cannibalize_resv_end,
+ TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
+ unsigned int last_start, unsigned int last_len),
+ TP_ARGS(start, end, len, last_start, last_len),
+ TP_STRUCT__entry(
+ __field(unsigned int, start)
+ __field(unsigned int, end)
+ __field(unsigned int, len)
+ __field(unsigned int, last_start)
+ __field(unsigned int, last_len)
+ ),
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ __entry->len = len;
+ __entry->last_start = last_start;
+ __entry->last_len = last_len;
+ ),
+ TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
+ __entry->len, __entry->last_start, __entry->last_len)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_resv_bits);
+
+TRACE_EVENT(ocfs2_resmap_claimed_bits_begin,
+ TP_PROTO(unsigned int cstart, unsigned int cend, unsigned int clen,
+ unsigned int r_start, unsigned int r_end, unsigned int r_len,
+ unsigned int last_start, unsigned int last_len),
+ TP_ARGS(cstart, cend, clen, r_start, r_end,
+ r_len, last_start, last_len),
+ TP_STRUCT__entry(
+ __field(unsigned int, cstart)
+ __field(unsigned int, cend)
+ __field(unsigned int, clen)
+ __field(unsigned int, r_start)
+ __field(unsigned int, r_end)
+ __field(unsigned int, r_len)
+ __field(unsigned int, last_start)
+ __field(unsigned int, last_len)
+ ),
+ TP_fast_assign(
+ __entry->cstart = cstart;
+ __entry->cend = cend;
+ __entry->clen = clen;
+ __entry->r_start = r_start;
+ __entry->r_end = r_end;
+ __entry->r_len = r_len;
+ __entry->last_start = last_start;
+ __entry->last_len = last_len;
+ ),
+ TP_printk("%u %u %u %u %u %u %u %u",
+ __entry->cstart, __entry->cend, __entry->clen,
+ __entry->r_start, __entry->r_end, __entry->r_len,
+ __entry->last_start, __entry->last_len)
+);
+
+TRACE_EVENT(ocfs2_resmap_claimed_bits_end,
+ TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
+ unsigned int last_start, unsigned int last_len),
+ TP_ARGS(start, end, len, last_start, last_len),
+ TP_STRUCT__entry(
+ __field(unsigned int, start)
+ __field(unsigned int, end)
+ __field(unsigned int, len)
+ __field(unsigned int, last_start)
+ __field(unsigned int, last_len)
+ ),
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ __entry->len = len;
+ __entry->last_start = last_start;
+ __entry->last_len = last_len;
+ ),
+ TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
+ __entry->len, __entry->last_start, __entry->last_len)
+);
+
+/* End of trace events for fs/ocfs2/reservations.c. */
+
+/* Trace events for fs/ocfs2/quota_local.c. */
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_recover_local_quota_file);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_finish_quota_recovery);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(olq_set_dquot);
+
+/* End of trace events for fs/ocfs2/quota_local.c. */
+
+/* Trace events for fs/ocfs2/quota_global.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_quota_block);
+
+TRACE_EVENT(ocfs2_sync_dquot,
+ TP_PROTO(unsigned int dq_id, long long dqb_curspace,
+ long long spacechange, long long curinodes,
+ long long inodechange),
+ TP_ARGS(dq_id, dqb_curspace, spacechange, curinodes, inodechange),
+ TP_STRUCT__entry(
+ __field(unsigned int, dq_id)
+ __field(long long, dqb_curspace)
+ __field(long long, spacechange)
+ __field(long long, curinodes)
+ __field(long long, inodechange)
+ ),
+ TP_fast_assign(
+ __entry->dq_id = dq_id;
+ __entry->dqb_curspace = dqb_curspace;
+ __entry->spacechange = spacechange;
+ __entry->curinodes = curinodes;
+ __entry->inodechange = inodechange;
+ ),
+ TP_printk("%u %lld %lld %lld %lld", __entry->dq_id,
+ __entry->dqb_curspace, __entry->spacechange,
+ __entry->curinodes, __entry->inodechange)
+);
+
+TRACE_EVENT(ocfs2_sync_dquot_helper,
+ TP_PROTO(unsigned int dq_id, unsigned int dq_type, unsigned long type,
+ const char *s_id),
+ TP_ARGS(dq_id, dq_type, type, s_id),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, dq_id)
+ __field(unsigned int, dq_type)
+ __field(unsigned long, type)
+ __string(s_id, s_id)
+ ),
+ TP_fast_assign(
+ __entry->dq_id = dq_id;
+ __entry->dq_type = dq_type;
+ __entry->type = type;
+ __assign_str(s_id, s_id);
+ ),
+ TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type,
+ __entry->type, __get_str(s_id))
+);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_write_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
+
+/* End of trace events for fs/ocfs2/quota_global.c. */
+
+/* Trace events for fs/ocfs2/dir.c. */
+DEFINE_OCFS2_INT_EVENT(ocfs2_search_dirblock);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_dir_block);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_find_entry_el);
+
+TRACE_EVENT(ocfs2_dx_dir_search,
+ TP_PROTO(unsigned long long ino, int namelen, const char *name,
+ unsigned int major_hash, unsigned int minor_hash,
+ unsigned long long blkno),
+ TP_ARGS(ino, namelen, name, major_hash, minor_hash, blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(int, namelen)
+ __string(name, name)
+ __field(unsigned int, major_hash)
+ __field(unsigned int,minor_hash)
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->namelen = namelen;
+ __assign_str(name, name);
+ __entry->major_hash = major_hash;
+ __entry->minor_hash = minor_hash;
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%llu %.*s %u %u %llu", __entry->ino,
+ __entry->namelen, __get_str(name),
+ __entry->major_hash, __entry->minor_hash, __entry->blkno)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_dx_dir_search_leaf_info);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_delete_entry_dx);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_readdir);
+
+TRACE_EVENT(ocfs2_find_files_on_disk,
+ TP_PROTO(int namelen, const char *name, void *blkno,
+ unsigned long long dir),
+ TP_ARGS(namelen, name, blkno, dir),
+ TP_STRUCT__entry(
+ __field(int, namelen)
+ __string(name, name)
+ __field(void *, blkno)
+ __field(unsigned long long, dir)
+ ),
+ TP_fast_assign(
+ __entry->namelen = namelen;
+ __assign_str(name, name);
+ __entry->blkno = blkno;
+ __entry->dir = dir;
+ ),
+ TP_printk("%.*s %p %llu", __entry->namelen, __get_str(name),
+ __entry->blkno, __entry->dir)
+);
+
+TRACE_EVENT(ocfs2_check_dir_for_entry,
+ TP_PROTO(unsigned long long dir, int namelen, const char *name),
+ TP_ARGS(dir, namelen, name),
+ TP_STRUCT__entry(
+ __field(unsigned long long, dir)
+ __field(int, namelen)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->namelen = namelen;
+ __assign_str(name, name);
+ ),
+ TP_printk("%llu %.*s", __entry->dir,
+ __entry->namelen, __get_str(name))
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_dx_dir_attach_index);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_format_cluster);
+
+TRACE_EVENT(ocfs2_dx_dir_index_root_block,
+ TP_PROTO(unsigned long long dir,
+ unsigned int major_hash, unsigned int minor_hash,
+ int namelen, const char *name, unsigned int num_used),
+ TP_ARGS(dir, major_hash, minor_hash, namelen, name, num_used),
+ TP_STRUCT__entry(
+ __field(unsigned long long, dir)
+ __field(unsigned int, major_hash)
+ __field(unsigned int, minor_hash)
+ __field(int, namelen)
+ __string(name, name)
+ __field(unsigned int, num_used)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->major_hash = major_hash;
+ __entry->minor_hash = minor_hash;
+ __entry->namelen = namelen;
+ __assign_str(name, name);
+ __entry->num_used = num_used;
+ ),
+ TP_printk("%llu %x %x %.*s %u", __entry->dir,
+ __entry->major_hash, __entry->minor_hash,
+ __entry->namelen, __get_str(name), __entry->num_used)
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_extend_dir);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_rebalance);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_dx_dir_rebalance_split);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_prepare_dir_for_insert);
+
+/* End of trace events for fs/ocfs2/dir.c. */
+
+/* Trace events for fs/ocfs2/namei.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__dentry_ops,
+ TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
+ unsigned long long dir_blkno, unsigned long long extra),
+ TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra),
+ TP_STRUCT__entry(
+ __field(void *, dir)
+ __field(void *, dentry)
+ __field(int, name_len)
+ __string(name, name)
+ __field(unsigned long long, dir_blkno)
+ __field(unsigned long long, extra)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->dentry = dentry;
+ __entry->name_len = name_len;
+ __assign_str(name, name);
+ __entry->dir_blkno = dir_blkno;
+ __entry->extra = extra;
+ ),
+ TP_printk("%p %p %.*s %llu %llu", __entry->dir, __entry->dentry,
+ __entry->name_len, __get_str(name),
+ __entry->dir_blkno, __entry->extra)
+);
+
+#define DEFINE_OCFS2_DENTRY_OPS(name) \
+DEFINE_EVENT(ocfs2__dentry_ops, name, \
+TP_PROTO(void *dir, void *dentry, int name_len, const char *name, \
+ unsigned long long dir_blkno, unsigned long long extra), \
+ TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra))
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_lookup);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_mkdir);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_create);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_unlink);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_symlink_create);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_mv_orphaned_inode_to_new);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_lookup_ret);
+
+TRACE_EVENT(ocfs2_mknod,
+ TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
+ unsigned long long dir_blkno, unsigned long dev, int mode),
+ TP_ARGS(dir, dentry, name_len, name, dir_blkno, dev, mode),
+ TP_STRUCT__entry(
+ __field(void *, dir)
+ __field(void *, dentry)
+ __field(int, name_len)
+ __string(name, name)
+ __field(unsigned long long, dir_blkno)
+ __field(unsigned long, dev)
+ __field(int, mode)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->dentry = dentry;
+ __entry->name_len = name_len;
+ __assign_str(name, name);
+ __entry->dir_blkno = dir_blkno;
+ __entry->dev = dev;
+ __entry->mode = mode;
+ ),
+ TP_printk("%p %p %.*s %llu %lu %d", __entry->dir, __entry->dentry,
+ __entry->name_len, __get_str(name),
+ __entry->dir_blkno, __entry->dev, __entry->mode)
+);
+
+TRACE_EVENT(ocfs2_link,
+ TP_PROTO(unsigned long long ino, int old_len, const char *old_name,
+ int name_len, const char *name),
+ TP_ARGS(ino, old_len, old_name, name_len, name),
+ TP_STRUCT__entry(
+ __field(unsigned long long, ino)
+ __field(int, old_len)
+ __string(old_name, old_name)
+ __field(int, name_len)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->ino = ino;
+ __entry->old_len = old_len;
+ __assign_str(old_name, old_name);
+ __entry->name_len = name_len;
+ __assign_str(name, name);
+ ),
+ TP_printk("%llu %.*s %.*s", __entry->ino,
+ __entry->old_len, __get_str(old_name),
+ __entry->name_len, __get_str(name))
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_unlink_noent);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock_end);
+
+TRACE_EVENT(ocfs2_rename,
+ TP_PROTO(void *old_dir, void *old_dentry,
+ void *new_dir, void *new_dentry,
+ int old_len, const char *old_name,
+ int new_len, const char *new_name),
+ TP_ARGS(old_dir, old_dentry, new_dir, new_dentry,
+ old_len, old_name, new_len, new_name),
+ TP_STRUCT__entry(
+ __field(void *, old_dir)
+ __field(void *, old_dentry)
+ __field(void *, new_dir)
+ __field(void *, new_dentry)
+ __field(int, old_len)
+ __string(old_name, old_name)
+ __field(int, new_len)
+ __string(new_name, new_name)
+ ),
+ TP_fast_assign(
+ __entry->old_dir = old_dir;
+ __entry->old_dentry = old_dentry;
+ __entry->new_dir = new_dir;
+ __entry->new_dentry = new_dentry;
+ __entry->old_len = old_len;
+ __assign_str(old_name, old_name);
+ __entry->new_len = new_len;
+ __assign_str(new_name, new_name);
+ ),
+ TP_printk("%p %p %p %p %.*s %.*s",
+ __entry->old_dir, __entry->old_dentry,
+ __entry->new_dir, __entry->new_dentry,
+ __entry->old_len, __get_str(old_name),
+ __entry->new_len, __get_str(new_name))
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted);
+
+TRACE_EVENT(ocfs2_rename_target_exists,
+ TP_PROTO(int new_len, const char *new_name),
+ TP_ARGS(new_len, new_name),
+ TP_STRUCT__entry(
+ __field(int, new_len)
+ __string(new_name, new_name)
+ ),
+ TP_fast_assign(
+ __entry->new_len = new_len;
+ __assign_str(new_name, new_name);
+ ),
+ TP_printk("%.*s", __entry->new_len, __get_str(new_name))
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_rename_disagree);
+
+TRACE_EVENT(ocfs2_rename_over_existing,
+ TP_PROTO(unsigned long long new_blkno, void *new_bh,
+ unsigned long long newdi_blkno),
+ TP_ARGS(new_blkno, new_bh, newdi_blkno),
+ TP_STRUCT__entry(
+ __field(unsigned long long, new_blkno)
+ __field(void *, new_bh)
+ __field(unsigned long long, newdi_blkno)
+ ),
+ TP_fast_assign(
+ __entry->new_blkno = new_blkno;
+ __entry->new_bh = new_bh;
+ __entry->newdi_blkno = newdi_blkno;
+ ),
+ TP_printk("%llu %p %llu", __entry->new_blkno, __entry->new_bh,
+ __entry->newdi_blkno)
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_create_symlink_data);
+
+TRACE_EVENT(ocfs2_symlink_begin,
+ TP_PROTO(void *dir, void *dentry, const char *symname,
+ int len, const char *name),
+ TP_ARGS(dir, dentry, symname, len, name),
+ TP_STRUCT__entry(
+ __field(void *, dir)
+ __field(void *, dentry)
+ __field(const char *, symname)
+ __field(int, len)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __entry->dentry = dentry;
+ __entry->symname = symname;
+ __entry->len = len;
+ __assign_str(name, name);
+ ),
+ TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry,
+ __entry->symname, __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_blkno_stringify,
+ TP_PROTO(unsigned long long blkno, const char *name, int namelen),
+ TP_ARGS(blkno, name, namelen),
+ TP_STRUCT__entry(
+ __field(unsigned long long, blkno)
+ __string(name, name)
+ __field(int, namelen)
+ ),
+ TP_fast_assign(
+ __entry->blkno = blkno;
+ __assign_str(name, name);
+ __entry->namelen = namelen;
+ ),
+ TP_printk("%llu %s %d", __entry->blkno, __get_str(name),
+ __entry->namelen)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_add_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_orphan_add_end);
+
+TRACE_EVENT(ocfs2_orphan_del,
+ TP_PROTO(unsigned long long dir, const char *name, int namelen),
+ TP_ARGS(dir, name, namelen),
+ TP_STRUCT__entry(
+ __field(unsigned long long, dir)
+ __string(name, name)
+ __field(int, namelen)
+ ),
+ TP_fast_assign(
+ __entry->dir = dir;
+ __assign_str(name, name);
+ __entry->namelen = namelen;
+ ),
+ TP_printk("%llu %s %d", __entry->dir, __get_str(name),
+ __entry->namelen)
+);
+
+/* End of trace events for fs/ocfs2/namei.c. */
+
+/* Trace events for fs/ocfs2/dcache.c. */
+
+TRACE_EVENT(ocfs2_dentry_revalidate,
+ TP_PROTO(void *dentry, int len, const char *name),
+ TP_ARGS(dentry, len, name),
+ TP_STRUCT__entry(
+ __field(void *, dentry)
+ __field(int, len)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->dentry = dentry;
+ __entry->len = len;
+ __assign_str(name, name);
+ ),
+ TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_dentry_revalidate_negative,
+ TP_PROTO(int len, const char *name, unsigned long pgen,
+ unsigned long gen),
+ TP_ARGS(len, name, pgen, gen),
+ TP_STRUCT__entry(
+ __field(int, len)
+ __string(name, name)
+ __field(unsigned long, pgen)
+ __field(unsigned long, gen)
+ ),
+ TP_fast_assign(
+ __entry->len = len;
+ __assign_str(name, name);
+ __entry->pgen = pgen;
+ __entry->gen = gen;
+ ),
+ TP_printk("%.*s %lu %lu", __entry->len, __get_str(name),
+ __entry->pgen, __entry->gen)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_delete);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_dentry_revalidate_orphaned);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_nofsdata);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_dentry_revalidate_ret);
+
+TRACE_EVENT(ocfs2_find_local_alias,
+ TP_PROTO(int len, const char *name),
+ TP_ARGS(len, name),
+ TP_STRUCT__entry(
+ __field(int, len)
+ __string(name, name)
+ ),
+ TP_fast_assign(
+ __entry->len = len;
+ __assign_str(name, name);
+ ),
+ TP_printk("%.*s", __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_dentry_attach_lock,
+ TP_PROTO(int len, const char *name,
+ unsigned long long parent, void *fsdata),
+ TP_ARGS(len, name, parent, fsdata),
+ TP_STRUCT__entry(
+ __field(int, len)
+ __string(name, name)
+ __field(unsigned long long, parent)
+ __field(void *, fsdata)
+ ),
+ TP_fast_assign(
+ __entry->len = len;
+ __assign_str(name, name);
+ __entry->parent = parent;
+ __entry->fsdata = fsdata;
+ ),
+ TP_printk("%.*s %llu %p", __entry->len, __get_str(name),
+ __entry->parent, __entry->fsdata)
+);
+
+TRACE_EVENT(ocfs2_dentry_attach_lock_found,
+ TP_PROTO(const char *name, unsigned long long parent,
+ unsigned long long ino),
+ TP_ARGS(name, parent, ino),
+ TP_STRUCT__entry(
+ __string(name, name)
+ __field(unsigned long long, parent)
+ __field(unsigned long long, ino)
+ ),
+ TP_fast_assign(
+ __assign_str(name, name);
+ __entry->parent = parent;
+ __entry->ino = ino;
+ ),
+ TP_printk("%s %llu %llu", __get_str(name), __entry->parent, __entry->ino)
+);
+/* End of trace events for fs/ocfs2/dcache.c. */
+
+/* Trace events for fs/ocfs2/export.c. */
+
+TRACE_EVENT(ocfs2_get_dentry_begin,
+ TP_PROTO(void *sb, void *handle, unsigned long long blkno),
+ TP_ARGS(sb, handle, blkno),
+ TP_STRUCT__entry(
+ __field(void *, sb)
+ __field(void *, handle)
+ __field(unsigned long long, blkno)
+ ),
+ TP_fast_assign(
+ __entry->sb = sb;
+ __entry->handle = handle;
+ __entry->blkno = blkno;
+ ),
+ TP_printk("%p %p %llu", __entry->sb, __entry->handle, __entry->blkno)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_get_dentry_test_bit);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_get_dentry_stale);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_get_dentry_generation);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_dentry_end);
+
+TRACE_EVENT(ocfs2_get_parent,
+ TP_PROTO(void *child, int len, const char *name,
+ unsigned long long ino),
+ TP_ARGS(child, len, name, ino),
+ TP_STRUCT__entry(
+ __field(void *, child)
+ __field(int, len)
+ __string(name, name)
+ __field(unsigned long long, ino)
+ ),
+ TP_fast_assign(
+ __entry->child = child;
+ __entry->len = len;
+ __assign_str(name, name);
+ __entry->ino = ino;
+ ),
+ TP_printk("%p %.*s %llu", __entry->child, __entry->len,
+ __get_str(name), __entry->ino)
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_parent_end);
+
+TRACE_EVENT(ocfs2_encode_fh_begin,
+ TP_PROTO(void *dentry, int name_len, const char *name,
+ void *fh, int len, int connectable),
+ TP_ARGS(dentry, name_len, name, fh, len, connectable),
+ TP_STRUCT__entry(
+ __field(void *, dentry)
+ __field(int, name_len)
+ __string(name, name)
+ __field(void *, fh)
+ __field(int, len)
+ __field(int, connectable)
+ ),
+ TP_fast_assign(
+ __entry->dentry = dentry;
+ __entry->name_len = name_len;
+ __assign_str(name, name);
+ __entry->fh = fh;
+ __entry->len = len;
+ __entry->connectable = connectable;
+ ),
+ TP_printk("%p %.*s %p %d %d", __entry->dentry, __entry->name_len,
+ __get_str(name), __entry->fh, __entry->len,
+ __entry->connectable)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_self);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_parent);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_encode_fh_type);
+
+/* End of trace events for fs/ocfs2/export.c. */
+
+/* Trace events for fs/ocfs2/journal.c. */
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_commit_cache_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_journal_init);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_journal_init_maxlen);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_journal_shutdown);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_journal_shutdown_wait);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_complete_recovery);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_complete_recovery_end);
+
+TRACE_EVENT(ocfs2_complete_recovery_slot,
+ TP_PROTO(int slot, unsigned long long la_ino,
+ unsigned long long tl_ino, void *qrec),
+ TP_ARGS(slot, la_ino, tl_ino, qrec),
+ TP_STRUCT__entry(
+ __field(int, slot)
+ __field(unsigned long long, la_ino)
+ __field(unsigned long long, tl_ino)
+ __field(void *, qrec)
+ ),
+ TP_fast_assign(
+ __entry->slot = slot;
+ __entry->la_ino = la_ino;
+ __entry->tl_ino = tl_ino;
+ __entry->qrec = qrec;
+ ),
+ TP_printk("%d %llu %llu %p", __entry->slot, __entry->la_ino,
+ __entry->tl_ino, __entry->qrec)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_recovery_thread_node);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_recovery_thread_end);
+
+TRACE_EVENT(ocfs2_recovery_thread,
+ TP_PROTO(int node_num, int osb_node_num, int disable,
+ void *recovery_thread, int map_set),
+ TP_ARGS(node_num, osb_node_num, disable, recovery_thread, map_set),
+ TP_STRUCT__entry(
+ __field(int, node_num)
+ __field(int, osb_node_num)
+ __field(int,disable)
+ __field(void *, recovery_thread)
+ __field(int,map_set)
+ ),
+ TP_fast_assign(
+ __entry->node_num = node_num;
+ __entry->osb_node_num = osb_node_num;
+ __entry->disable = disable;
+ __entry->recovery_thread = recovery_thread;
+ __entry->map_set = map_set;
+ ),
+ TP_printk("%d %d %d %p %d", __entry->node_num,
+ __entry->osb_node_num, __entry->disable,
+ __entry->recovery_thread, __entry->map_set)
+);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_replay_journal_recovered);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_lock_err);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_skip);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_recover_node);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_recover_node_skip);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_mark_dead_nodes);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_end);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_filldir);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_recover_orphans);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_recover_orphans_iput);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_wait_on_mount);
+
+/* End of trace events for fs/ocfs2/journal.c. */
+
+/* Trace events for fs/ocfs2/buffer_head_io.c. */
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_read_blocks_sync);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_read_blocks_sync_jbd);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_read_blocks_from_disk);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_bh);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_end);
+
+TRACE_EVENT(ocfs2_write_block,
+ TP_PROTO(unsigned long long block, void *ci),
+ TP_ARGS(block, ci),
+ TP_STRUCT__entry(
+ __field(unsigned long long, block)
+ __field(void *, ci)
+ ),
+ TP_fast_assign(
+ __entry->block = block;
+ __entry->ci = ci;
+ ),
+ TP_printk("%llu %p", __entry->block, __entry->ci)
+);
+
+TRACE_EVENT(ocfs2_read_blocks_begin,
+ TP_PROTO(void *ci, unsigned long long block,
+ unsigned int nr, int flags),
+ TP_ARGS(ci, block, nr, flags),
+ TP_STRUCT__entry(
+ __field(void *, ci)
+ __field(unsigned long long, block)
+ __field(unsigned int, nr)
+ __field(int, flags)
+ ),
+ TP_fast_assign(
+ __entry->ci = ci;
+ __entry->block = block;
+ __entry->nr = nr;
+ __entry->flags = flags;
+ ),
+ TP_printk("%p %llu %u %d", __entry->ci, __entry->block,
+ __entry->nr, __entry->flags)
+);
+
+/* End of trace events for fs/ocfs2/buffer_head_io.c. */
+
+/* Trace events for fs/ocfs2/uptodate.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_copied_metadata_tree);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_metadata_cache_purge);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_buffer_cached_begin);
+
+TRACE_EVENT(ocfs2_buffer_cached_end,
+ TP_PROTO(int index, void *item),
+ TP_ARGS(index, item),
+ TP_STRUCT__entry(
+ __field(int, index)
+ __field(void *, item)
+ ),
+ TP_fast_assign(
+ __entry->index = index;
+ __entry->item = item;
+ ),
+ TP_printk("%d %p", __entry->index, __entry->item)
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_append_cache_array);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_insert_cache_tree);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_expand_cache);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_set_buffer_uptodate);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_set_buffer_uptodate_begin);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_remove_metadata_array);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_remove_metadata_tree);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_remove_block_from_cache);
+
+/* End of trace events for fs/ocfs2/uptodate.c. */
+#endif /* _TRACE_OCFS2_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE ocfs2_trace
+#include <trace/define_trace.h>
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 123bc520a2c..f266d67df3c 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -23,10 +23,12 @@
struct ocfs2_dquot {
struct dquot dq_dquot; /* Generic VFS dquot */
loff_t dq_local_off; /* Offset in the local quota file */
+ u64 dq_local_phys_blk; /* Physical block carrying quota structure */
struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
s64 dq_origspace; /* Last globally synced space usage */
s64 dq_originodes; /* Last globally synced inode usage */
+ struct llist_node list; /* Member of list of dquots to drop */
};
/* Description of one chunk to recover in memory */
@@ -51,8 +53,9 @@ struct ocfs2_mem_dqinfo {
struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
+ u64 dqi_giblk; /* Number of block with global information header */
struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
- struct buffer_head *dqi_ibh; /* Buffer with information header */
+ struct buffer_head *dqi_libh; /* Buffer with local information header */
struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
struct delayed_work dqi_sync_work; /* Work for syncing dquots */
struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
@@ -102,13 +105,15 @@ static inline int ocfs2_global_release_dquot(struct dquot *dquot)
int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
-int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
- struct buffer_head **bh);
+int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh);
+int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
+ struct buffer_head **bh);
+int ocfs2_create_local_dquot(struct dquot *dquot);
+int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
+int ocfs2_local_write_dquot(struct dquot *dquot);
+void ocfs2_drop_dquot_refs(struct work_struct *work);
extern const struct dquot_operations ocfs2_quota_operations;
extern struct quota_format_type ocfs2_quota_format;
-int ocfs2_quota_setup(void);
-void ocfs2_quota_shutdown(void);
-
#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 355f41d1d52..b990a62cff5 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,14 +3,15 @@
*/
#include <linux/spinlock.h>
#include <linux/fs.h>
+#include <linux/slab.h>
#include <linux/quota.h>
#include <linux/quotaops.h>
#include <linux/dqblk_qtree.h>
#include <linux/jiffies.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
+#include <linux/llist.h>
-#define MLOG_MASK_PREFIX ML_QUOTA
#include <cluster/masklog.h>
#include "ocfs2_fs.h"
@@ -24,9 +25,44 @@
#include "dlmglue.h"
#include "uptodate.h"
#include "super.h"
+#include "buffer_head_io.h"
#include "quota.h"
+#include "ocfs2_trace.h"
-static struct workqueue_struct *ocfs2_quota_wq = NULL;
+/*
+ * Locking of quotas with OCFS2 is rather complex. Here are rules that
+ * should be obeyed by all the functions:
+ * - any write of quota structure (either to local or global file) is protected
+ * by dqio_mutex or dquot->dq_lock.
+ * - any modification of global quota file holds inode cluster lock, i_mutex,
+ * and ip_alloc_sem of the global quota file (achieved by
+ * ocfs2_lock_global_qf). It also has to hold qinfo_lock.
+ * - an allocation of new blocks for local quota file is protected by
+ * its ip_alloc_sem
+ *
+ * A rough sketch of locking dependencies (lf = local file, gf = global file):
+ * Normal filesystem operation:
+ * start_trans -> dqio_mutex -> write to lf
+ * Syncing of local and global file:
+ * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ * write to gf
+ * -> write to lf
+ * Acquire dquot for the first time:
+ * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf
+ * -> alloc space for gf
+ * -> start_trans -> qinfo_lock -> write to gf
+ * -> ip_alloc_sem of lf -> alloc space for lf
+ * -> write to lf
+ * Release last reference to dquot:
+ * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf
+ * -> write to lf
+ * Note that all the above operations also hold the inode cluster lock of lf.
+ * Recovery:
+ * inode cluster lock of recovered lf
+ * -> read bitmaps -> ip_alloc_sem of lf
+ * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ * write to gf
+ */
static void qsync_work_fn(struct work_struct *work);
@@ -60,7 +96,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
struct ocfs2_global_disk_dqblk *d = dp;
struct mem_dqblk *m = &dquot->dq_dqb;
- d->dqb_id = cpu_to_le32(dquot->dq_id);
+ d->dqb_id = cpu_to_le32(from_kqid(&init_user_ns, dquot->dq_id));
d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
@@ -77,11 +113,14 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
{
struct ocfs2_global_disk_dqblk *d = dp;
struct ocfs2_mem_dqinfo *oinfo =
- sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+ sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
if (qtree_entry_unused(&oinfo->dqi_gi, dp))
return 0;
- return le32_to_cpu(d->dqb_id) == dquot->dq_id;
+
+ return qid_eq(make_kqid(&init_user_ns, dquot->dq_id.type,
+ le32_to_cpu(d->dqb_id)),
+ dquot->dq_id);
}
struct qtree_fmt_operations ocfs2_global_ops = {
@@ -90,14 +129,12 @@ struct qtree_fmt_operations ocfs2_global_ops = {
.is_id = ocfs2_global_is_id,
};
-static int ocfs2_validate_quota_block(struct super_block *sb,
- struct buffer_head *bh)
+int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
{
struct ocfs2_disk_dqtrailer *dqt =
ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
- mlog(0, "Validating quota block %llu\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_validate_quota_block((unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
@@ -109,54 +146,19 @@ static int ocfs2_validate_quota_block(struct super_block *sb,
return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
}
-int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
- struct buffer_head **bh)
+int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
+ struct buffer_head **bhp)
{
- int rc = 0;
- struct buffer_head *tmp = *bh;
-
- if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
- ocfs2_error(inode->i_sb,
- "Quota file %llu is probably corrupted! Requested "
- "to read block %Lu but file has size only %Lu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)v_block,
- (unsigned long long)i_size_read(inode));
- return -EIO;
- }
- rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
- ocfs2_validate_quota_block);
+ int rc;
+
+ *bhp = NULL;
+ rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0,
+ ocfs2_validate_quota_block);
if (rc)
mlog_errno(rc);
-
- /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
- if (!rc && !*bh)
- *bh = tmp;
-
return rc;
}
-static int ocfs2_get_quota_block(struct inode *inode, int block,
- struct buffer_head **bh)
-{
- u64 pblock, pcount;
- int err;
-
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
- err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
- if (err) {
- mlog_errno(err);
- return err;
- }
- *bh = sb_getblk(inode->i_sb, pblock);
- if (!*bh) {
- err = -EIO;
- mlog_errno(err);
- }
- return err;
-}
-
/* Read data from global quotafile - avoid pagecache and such because we cannot
* afford acquiring the locks... We use quota cluster lock to serialize
* operations. Caller is responsible for acquiring it. */
@@ -171,6 +173,7 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
int err = 0;
struct buffer_head *bh;
size_t toread, tocopy;
+ u64 pblock = 0, pcount = 0;
if (off > i_size)
return 0;
@@ -179,8 +182,19 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
toread = len;
while (toread > 0) {
tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
+ if (!pcount) {
+ err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock,
+ &pcount, NULL);
+ if (err) {
+ mlog_errno(err);
+ return err;
+ }
+ } else {
+ pcount--;
+ pblock++;
+ }
bh = NULL;
- err = ocfs2_read_quota_block(gqinode, blk, &bh);
+ err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
if (err) {
mlog_errno(err);
return err;
@@ -208,6 +222,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
int err = 0, new = 0, ja_type;
struct buffer_head *bh = NULL;
handle_t *handle = journal_current_handle();
+ u64 pblock, pcount;
if (!handle) {
mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
@@ -220,12 +235,11 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
}
- mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
- if (gqinode->i_size < off + len) {
+ if (i_size_read(gqinode) < off + len) {
loff_t rounded_end =
ocfs2_align_bytes_to_blocks(sb, off + len);
- /* Space is already allocated in ocfs2_global_read_dquot() */
+ /* Space is already allocated in ocfs2_acquire_dquot() */
err = ocfs2_simple_size_update(gqinode,
oinfo->dqi_gqi_bh,
rounded_end);
@@ -233,13 +247,20 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
goto out;
new = 1;
}
+ err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL);
+ if (err) {
+ mlog_errno(err);
+ goto out;
+ }
/* Not rewriting whole block? */
if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
!new) {
- err = ocfs2_read_quota_block(gqinode, blk, &bh);
+ err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
} else {
- err = ocfs2_get_quota_block(gqinode, blk, &bh);
+ bh = sb_getblk(sb, pblock);
+ if (!bh)
+ err = -ENOMEM;
ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
}
if (err) {
@@ -260,19 +281,15 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
brelse(bh);
goto out;
}
- err = ocfs2_journal_dirty(handle, bh);
+ ocfs2_journal_dirty(handle, bh);
brelse(bh);
- if (err < 0)
- goto out;
out:
if (err) {
- mutex_unlock(&gqinode->i_mutex);
mlog_errno(err);
return err;
}
gqinode->i_version++;
ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
- mutex_unlock(&gqinode->i_mutex);
return len;
}
@@ -290,11 +307,23 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
else
WARN_ON(bh != oinfo->dqi_gqi_bh);
spin_unlock(&dq_data_lock);
+ if (ex) {
+ mutex_lock(&oinfo->dqi_gqinode->i_mutex);
+ down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+ } else {
+ down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+ }
return 0;
}
void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
{
+ if (ex) {
+ up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+ mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
+ } else {
+ up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
+ }
ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
brelse(oinfo->dqi_gqi_bh);
spin_lock(&dq_data_lock);
@@ -312,10 +341,9 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
struct ocfs2_global_disk_dqinfo dinfo;
struct mem_dqinfo *info = sb_dqinfo(sb, type);
struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ u64 pcount;
int status;
- mlog_entry_void();
-
/* Read global header */
gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
OCFS2_INVALID_SLOT);
@@ -338,9 +366,19 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
mlog_errno(status);
goto out_err;
}
+
+ status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
+ &pcount, NULL);
+ if (status < 0)
+ goto out_unlock;
+
+ status = ocfs2_qinfo_lock(oinfo, 0);
+ if (status < 0)
+ goto out_unlock;
status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
sizeof(struct ocfs2_global_disk_dqinfo),
OCFS2_GLOBAL_INFO_OFF);
+ ocfs2_qinfo_unlock(oinfo, 0);
ocfs2_unlock_global_qf(oinfo, 0);
if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
@@ -361,12 +399,15 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
OCFS2_QBLK_RESERVED_SPACE;
oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
- queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
- msecs_to_jiffies(oinfo->dqi_syncms));
+ schedule_delayed_work(&oinfo->dqi_sync_work,
+ msecs_to_jiffies(oinfo->dqi_syncms));
out_err:
- mlog_exit(status);
return status;
+out_unlock:
+ ocfs2_unlock_global_qf(oinfo, 0);
+ mlog_errno(status);
+ goto out_err;
}
/* Write information to global quota file. Expects exlusive lock on quota
@@ -425,78 +466,10 @@ static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
{
- /* We modify all the allocated blocks, tree root, and info block */
+ /* We modify all the allocated blocks, tree root, info block and
+ * the inode */
return (ocfs2_global_qinit_alloc(sb, type) + 2) *
- OCFS2_QUOTA_BLOCK_UPDATE_CREDITS;
-}
-
-/* Read in information from global quota file and acquire a reference to it.
- * dquot_acquire() has already started the transaction and locked quota file */
-int ocfs2_global_read_dquot(struct dquot *dquot)
-{
- int err, err2, ex = 0;
- struct super_block *sb = dquot->dq_sb;
- int type = dquot->dq_type;
- struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
- struct ocfs2_super *osb = OCFS2_SB(sb);
- struct inode *gqinode = info->dqi_gqinode;
- int need_alloc = ocfs2_global_qinit_alloc(sb, type);
- handle_t *handle = NULL;
-
- err = ocfs2_qinfo_lock(info, 0);
- if (err < 0)
- goto out;
- err = qtree_read_dquot(&info->dqi_gi, dquot);
- if (err < 0)
- goto out_qlock;
- OCFS2_DQUOT(dquot)->dq_use_count++;
- OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
- OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
- ocfs2_qinfo_unlock(info, 0);
-
- if (!dquot->dq_off) { /* No real quota entry? */
- ex = 1;
- /*
- * Add blocks to quota file before we start a transaction since
- * locking allocators ranks above a transaction start
- */
- WARN_ON(journal_current_handle());
- down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
- err = ocfs2_extend_no_holes(gqinode,
- gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
- gqinode->i_size);
- up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
- if (err < 0)
- goto out;
- }
-
- handle = ocfs2_start_trans(osb,
- ocfs2_calc_global_qinit_credits(sb, type));
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto out;
- }
- err = ocfs2_qinfo_lock(info, ex);
- if (err < 0)
- goto out_trans;
- err = qtree_write_dquot(&info->dqi_gi, dquot);
- if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
- err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
- if (!err)
- err = err2;
- }
-out_qlock:
- if (ex)
- ocfs2_qinfo_unlock(info, 1);
- else
- ocfs2_qinfo_unlock(info, 0);
-out_trans:
- if (handle)
- ocfs2_commit_trans(osb, handle);
-out:
- if (err < 0)
- mlog_errno(err);
- return err;
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1;
}
/* Sync local information about quota modifications with global quota file.
@@ -506,7 +479,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
{
int err, err2;
struct super_block *sb = dquot->dq_sb;
- int type = dquot->dq_type;
+ int type = dquot->dq_id.type;
struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
struct ocfs2_global_disk_dqblk dqblk;
s64 spacechange, inodechange;
@@ -535,9 +508,11 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
olditime = dquot->dq_dqb.dqb_itime;
oldbtime = dquot->dq_dqb.dqb_btime;
ocfs2_global_disk2memdqb(dquot, &dqblk);
- mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
- dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
- dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
+ trace_ocfs2_sync_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+ dquot->dq_dqb.dqb_curspace,
+ (long long)spacechange,
+ dquot->dq_dqb.dqb_curinodes,
+ (long long)inodechange);
if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
dquot->dq_dqb.dqb_curspace += spacechange;
if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
@@ -584,9 +559,9 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
spin_unlock(&dq_data_lock);
err = ocfs2_qinfo_lock(info, freeing);
if (err < 0) {
- mlog(ML_ERROR, "Failed to lock quota info, loosing quota write"
- " (type=%d, id=%u)\n", dquot->dq_type,
- (unsigned)dquot->dq_id);
+ mlog(ML_ERROR, "Failed to lock quota info, losing quota write"
+ " (type=%d, id=%u)\n", dquot->dq_id.type,
+ (unsigned)from_kqid(&init_user_ns, dquot->dq_id));
goto out;
}
if (freeing)
@@ -621,9 +596,10 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
struct ocfs2_super *osb = OCFS2_SB(sb);
int status = 0;
- mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
- dquot->dq_type, type, sb->s_id);
- if (type != dquot->dq_type)
+ trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id),
+ dquot->dq_id.type,
+ type, sb->s_id);
+ if (type != dquot->dq_id.type)
goto out;
status = ocfs2_lock_global_qf(oinfo, 1);
if (status < 0)
@@ -637,19 +613,17 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
}
mutex_lock(&sb_dqopt(sb)->dqio_mutex);
status = ocfs2_sync_dquot(dquot);
- mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
if (status < 0)
mlog_errno(status);
/* We have to write local structure as well... */
- dquot_mark_dquot_dirty(dquot);
- status = dquot_commit(dquot);
+ status = ocfs2_local_write_dquot(dquot);
if (status < 0)
mlog_errno(status);
+ mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
ocfs2_commit_trans(osb, handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
out:
- mlog_exit(status);
return status;
}
@@ -661,8 +635,8 @@ static void qsync_work_fn(struct work_struct *work)
struct super_block *sb = oinfo->dqi_gqinode->i_sb;
dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
- queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
- msecs_to_jiffies(oinfo->dqi_syncms));
+ schedule_delayed_work(&oinfo->dqi_sync_work,
+ msecs_to_jiffies(oinfo->dqi_syncms));
}
/*
@@ -675,7 +649,8 @@ static int ocfs2_write_dquot(struct dquot *dquot)
struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
int status = 0;
- mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+ trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+ dquot->dq_id.type);
handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
if (IS_ERR(handle)) {
@@ -683,10 +658,11 @@ static int ocfs2_write_dquot(struct dquot *dquot)
mlog_errno(status);
goto out;
}
- status = dquot_commit(dquot);
+ mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
+ status = ocfs2_local_write_dquot(dquot);
+ mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
ocfs2_commit_trans(osb, handle);
out:
- mlog_exit(status);
return status;
}
@@ -704,51 +680,183 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
OCFS2_INODE_UPDATE_CREDITS;
}
+void ocfs2_drop_dquot_refs(struct work_struct *work)
+{
+ struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+ dquot_drop_work);
+ struct llist_node *list;
+ struct ocfs2_dquot *odquot, *next_odquot;
+
+ list = llist_del_all(&osb->dquot_drop_list);
+ llist_for_each_entry_safe(odquot, next_odquot, list, list) {
+ /* Drop the reference we acquired in ocfs2_dquot_release() */
+ dqput(&odquot->dq_dquot);
+ }
+}
+
+/*
+ * Called when the last reference to dquot is dropped. If we are called from
+ * downconvert thread, we cannot do all the handling here because grabbing
+ * quota lock could deadlock (the node holding the quota lock could need some
+ * other cluster lock to proceed but with blocked downconvert thread we cannot
+ * release any lock).
+ */
static int ocfs2_release_dquot(struct dquot *dquot)
{
handle_t *handle;
struct ocfs2_mem_dqinfo *oinfo =
- sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+ sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv;
struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
int status = 0;
- mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+ trace_ocfs2_release_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+ dquot->dq_id.type);
+ mutex_lock(&dquot->dq_lock);
+ /* Check whether we are not racing with some other dqget() */
+ if (atomic_read(&dquot->dq_count) > 1)
+ goto out;
+ /* Running from downconvert thread? Postpone quota processing to wq */
+ if (current == osb->dc_task) {
+ /*
+ * Grab our own reference to dquot and queue it for delayed
+ * dropping. Quota code rechecks after calling
+ * ->release_dquot() and won't free dquot structure.
+ */
+ dqgrab(dquot);
+ /* First entry on list -> queue work */
+ if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
+ queue_work(ocfs2_wq, &osb->dquot_drop_work);
+ goto out;
+ }
status = ocfs2_lock_global_qf(oinfo, 1);
if (status < 0)
goto out;
handle = ocfs2_start_trans(osb,
- ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type));
+ ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_id.type));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out_ilock;
}
- status = dquot_release(dquot);
+
+ status = ocfs2_global_release_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ status = ocfs2_local_release_dquot(handle, dquot);
+ /*
+ * If we fail here, we cannot do much as global structure is
+ * already released. So just complain...
+ */
+ if (status < 0)
+ mlog_errno(status);
+ /*
+ * Clear dq_off so that we search for the structure in quota file next
+ * time we acquire it. The structure might be deleted and reallocated
+ * elsewhere by another node while our dquot structure is on freelist.
+ */
+ dquot->dq_off = 0;
+ clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+out_trans:
ocfs2_commit_trans(osb, handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
out:
- mlog_exit(status);
+ mutex_unlock(&dquot->dq_lock);
+ if (status)
+ mlog_errno(status);
return status;
}
+/*
+ * Read global dquot structure from disk or create it if it does
+ * not exist. Also update use count of the global structure and
+ * create structure in node-local quota file.
+ */
static int ocfs2_acquire_dquot(struct dquot *dquot)
{
- struct ocfs2_mem_dqinfo *oinfo =
- sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
- int status = 0;
+ int status = 0, err;
+ int ex = 0;
+ struct super_block *sb = dquot->dq_sb;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ int type = dquot->dq_id.type;
+ struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+ struct inode *gqinode = info->dqi_gqinode;
+ int need_alloc = ocfs2_global_qinit_alloc(sb, type);
+ handle_t *handle;
- mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
- /* We need an exclusive lock, because we're going to update use count
- * and instantiate possibly new dquot structure */
- status = ocfs2_lock_global_qf(oinfo, 1);
+ trace_ocfs2_acquire_dquot(from_kqid(&init_user_ns, dquot->dq_id),
+ type);
+ mutex_lock(&dquot->dq_lock);
+ /*
+ * We need an exclusive lock, because we're going to update use count
+ * and instantiate possibly new dquot structure
+ */
+ status = ocfs2_lock_global_qf(info, 1);
if (status < 0)
goto out;
- status = dquot_acquire(dquot);
- ocfs2_unlock_global_qf(oinfo, 1);
+ status = ocfs2_qinfo_lock(info, 0);
+ if (status < 0)
+ goto out_dq;
+ /*
+ * We always want to read dquot structure from disk because we don't
+ * know what happened with it while it was on freelist.
+ */
+ status = qtree_read_dquot(&info->dqi_gi, dquot);
+ ocfs2_qinfo_unlock(info, 0);
+ if (status < 0)
+ goto out_dq;
+
+ OCFS2_DQUOT(dquot)->dq_use_count++;
+ OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+ OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+ if (!dquot->dq_off) { /* No real quota entry? */
+ ex = 1;
+ /*
+ * Add blocks to quota file before we start a transaction since
+ * locking allocators ranks above a transaction start
+ */
+ WARN_ON(journal_current_handle());
+ status = ocfs2_extend_no_holes(gqinode, NULL,
+ i_size_read(gqinode) + (need_alloc << sb->s_blocksize_bits),
+ i_size_read(gqinode));
+ if (status < 0)
+ goto out_dq;
+ }
+
+ handle = ocfs2_start_trans(osb,
+ ocfs2_calc_global_qinit_credits(sb, type));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ goto out_dq;
+ }
+ status = ocfs2_qinfo_lock(info, ex);
+ if (status < 0)
+ goto out_trans;
+ status = qtree_write_dquot(&info->dqi_gi, dquot);
+ if (ex && info_dirty(sb_dqinfo(sb, type))) {
+ err = __ocfs2_global_write_info(sb, type);
+ if (!status)
+ status = err;
+ }
+ ocfs2_qinfo_unlock(info, ex);
+out_trans:
+ ocfs2_commit_trans(osb, handle);
+out_dq:
+ ocfs2_unlock_global_qf(info, 1);
+ if (status < 0)
+ goto out;
+
+ status = ocfs2_create_local_dquot(dquot);
+ if (status < 0)
+ goto out;
+ set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out:
- mlog_exit(status);
+ mutex_unlock(&dquot->dq_lock);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -763,13 +871,13 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
int sync = 0;
int status;
struct super_block *sb = dquot->dq_sb;
- int type = dquot->dq_type;
+ int type = dquot->dq_id.type;
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(sb);
- mlog_entry("id=%u, type=%d", dquot->dq_id, type);
- dquot_mark_dquot_dirty(dquot);
+ trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id),
+ type);
/* In case user set some limits, sync dquot immediately to global
* quota file so that information propagates quicker */
@@ -792,19 +900,22 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
mlog_errno(status);
goto out_ilock;
}
+ mutex_lock(&sb_dqopt(sb)->dqio_mutex);
status = ocfs2_sync_dquot(dquot);
if (status < 0) {
mlog_errno(status);
- goto out_trans;
+ goto out_dlock;
}
/* Now write updated local dquot structure */
- status = dquot_commit(dquot);
-out_trans:
+ status = ocfs2_local_write_dquot(dquot);
+out_dlock:
+ mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
ocfs2_commit_trans(osb, handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
out:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -815,8 +926,6 @@ static int ocfs2_write_info(struct super_block *sb, int type)
int status = 0;
struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
- mlog_entry_void();
-
status = ocfs2_lock_global_qf(oinfo, 1);
if (status < 0)
goto out;
@@ -831,7 +940,8 @@ static int ocfs2_write_info(struct super_block *sb, int type)
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
out:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -851,7 +961,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
}
const struct dquot_operations ocfs2_quota_operations = {
- .write_dquot = ocfs2_write_dquot,
+ /* We never make dquot dirty so .write_dquot is never called */
.acquire_dquot = ocfs2_acquire_dquot,
.release_dquot = ocfs2_release_dquot,
.mark_dirty = ocfs2_mark_dquot_dirty,
@@ -859,20 +969,3 @@ const struct dquot_operations ocfs2_quota_operations = {
.alloc_dquot = ocfs2_alloc_dquot,
.destroy_dquot = ocfs2_destroy_dquot,
};
-
-int ocfs2_quota_setup(void)
-{
- ocfs2_quota_wq = create_workqueue("o2quot");
- if (!ocfs2_quota_wq)
- return -ENOMEM;
- return 0;
-}
-
-void ocfs2_quota_shutdown(void)
-{
- if (ocfs2_quota_wq) {
- flush_workqueue(ocfs2_quota_wq);
- destroy_workqueue(ocfs2_quota_wq);
- ocfs2_quota_wq = NULL;
- }
-}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index a6467f3d262..2001862bf2b 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,11 +3,11 @@
*/
#include <linux/fs.h>
+#include <linux/slab.h>
#include <linux/quota.h>
#include <linux/quotaops.h>
#include <linux/module.h>
-#define MLOG_MASK_PREFIX ML_QUOTA
#include <cluster/masklog.h>
#include "ocfs2_fs.h"
@@ -21,6 +21,8 @@
#include "dlmglue.h"
#include "quota.h"
#include "uptodate.h"
+#include "super.h"
+#include "ocfs2_trace.h"
/* Number of local quota structures per block */
static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -118,12 +120,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
lock_buffer(bh);
modify(bh, private);
unlock_buffer(bh);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- ocfs2_commit_trans(OCFS2_SB(sb), handle);
- return status;
- }
+ ocfs2_journal_dirty(handle, bh);
+
status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
if (status < 0) {
mlog_errno(status);
@@ -132,6 +130,39 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
return 0;
}
+/*
+ * Read quota block from a given logical offset.
+ *
+ * This function acquires ip_alloc_sem and thus it must not be called with a
+ * transaction started.
+ */
+static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+ struct buffer_head **bh)
+{
+ int rc = 0;
+ struct buffer_head *tmp = *bh;
+
+ if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
+ ocfs2_error(inode->i_sb,
+ "Quota file %llu is probably corrupted! Requested "
+ "to read block %Lu but file has size only %Lu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)v_block,
+ (unsigned long long)i_size_read(inode));
+ return -EIO;
+ }
+ rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
+ ocfs2_validate_quota_block);
+ if (rc)
+ mlog_errno(rc);
+
+ /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
/* Check whether we understand format of quota files */
static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
{
@@ -373,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
int status = 0;
struct ocfs2_quota_recovery *rec;
- mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for "
+ "slot %u\n", osb->dev_str, slot_num);
+
rec = ocfs2_alloc_quota_recovery();
if (!rec)
return ERR_PTR(-ENOMEM);
@@ -444,7 +477,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
struct ocfs2_recovery_chunk *rchunk, *next;
qsize_t spacechange, inodechange;
- mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
+ trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type);
list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
chunk = rchunk->rc_chunk;
@@ -468,7 +501,9 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
}
dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
ol_dqblk_block_off(sb, chunk, bit));
- dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type);
+ dquot = dqget(sb,
+ make_kqid(&init_user_ns, type,
+ le64_to_cpu(dqblk->dqb_id)));
if (!dquot) {
status = -EIO;
mlog(ML_ERROR, "Failed to get quota structure "
@@ -518,13 +553,11 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
goto out_commit;
}
lock_buffer(qbh);
- WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
- ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+ WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap));
+ ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, 1);
unlock_buffer(qbh);
- status = ocfs2_journal_dirty(handle, qbh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, qbh);
out_commit:
mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
ocfs2_commit_trans(OCFS2_SB(sb), handle);
@@ -546,7 +579,8 @@ out_put_bh:
}
if (status < 0)
free_recovery_list(&(rec->r_list[type]));
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -566,12 +600,14 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
struct inode *lqinode;
unsigned int flags;
- mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
+ "slot %u\n", osb->dev_str, slot_num);
+
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
for (type = 0; type < MAXQUOTAS; type++) {
if (list_empty(&(rec->r_list[type])))
continue;
- mlog(0, "Recovering quota in slot %d\n", slot_num);
+ trace_ocfs2_finish_quota_recovery(slot_num);
lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
if (!lqinode) {
status = -ENOENT;
@@ -582,8 +618,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
/* Someone else is holding the lock? Then he must be
* doing the recovery. Just skip the file... */
if (status == -EAGAIN) {
- mlog(ML_NOTICE, "skipping quota recovery for slot %d "
- "because quota file is locked.\n", slot_num);
+ printk(KERN_NOTICE "ocfs2: Skipping quota recovery on "
+ "device (%s) for slot %d because quota file is "
+ "locked.\n", osb->dev_str, slot_num);
status = 0;
goto out_put;
} else if (status < 0) {
@@ -630,9 +667,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
lock_buffer(bh);
ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
unlock_buffer(bh);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, bh);
out_trans:
ocfs2_commit_trans(osb, handle);
out_bh:
@@ -678,7 +713,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
INIT_LIST_HEAD(&oinfo->dqi_chunk);
oinfo->dqi_rec = NULL;
oinfo->dqi_lqi_bh = NULL;
- oinfo->dqi_ibh = NULL;
+ oinfo->dqi_libh = NULL;
status = ocfs2_global_read_info(sb, type);
if (status < 0)
@@ -704,7 +739,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
- oinfo->dqi_ibh = bh;
+ oinfo->dqi_libh = bh;
/* We crashed when using local quota file? */
if (!(info->dqi_flags & OLQF_CLEAN)) {
@@ -766,7 +801,7 @@ static int ocfs2_local_write_info(struct super_block *sb, int type)
{
struct mem_dqinfo *info = sb_dqinfo(sb, type);
struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
- ->dqi_ibh;
+ ->dqi_libh;
int status;
status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
@@ -789,10 +824,6 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
int mark_clean = 1, len;
int status;
- /* At this point we know there are no more dquots and thus
- * even if there's some sync in the pdflush queue, it won't
- * find any dquots and return without doing anything */
- cancel_delayed_work_sync(&oinfo->dqi_sync_work);
iput(oinfo->dqi_gqinode);
ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
ocfs2_lock_res_free(&oinfo->dqi_gqlock);
@@ -827,7 +858,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
/* Mark local file as clean */
info->dqi_flags |= OLQF_CLEAN;
status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
- oinfo->dqi_ibh,
+ oinfo->dqi_libh,
olq_update_info,
info);
if (status < 0) {
@@ -837,7 +868,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
out:
ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
- brelse(oinfo->dqi_ibh);
+ brelse(oinfo->dqi_libh);
brelse(oinfo->dqi_lqi_bh);
kfree(oinfo);
return 0;
@@ -852,35 +883,36 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data
+ ol_dqblk_block_offset(sb, od->dq_local_off));
- dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id);
+ dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns,
+ od->dq_dquot.dq_id));
spin_lock(&dq_data_lock);
dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
od->dq_origspace);
dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
od->dq_originodes);
spin_unlock(&dq_data_lock);
- mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
- od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod),
- (long long)le64_to_cpu(dqblk->dqb_inodemod));
+ trace_olq_set_dquot(
+ (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod),
+ (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod),
+ from_kqid(&init_user_ns, od->dq_dquot.dq_id));
}
/* Write dquot to local quota file */
-static int ocfs2_local_write_dquot(struct dquot *dquot)
+int ocfs2_local_write_dquot(struct dquot *dquot)
{
struct super_block *sb = dquot->dq_sb;
struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
+ struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_id.type];
int status;
- status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type],
- ol_dqblk_file_block(sb, od->dq_local_off),
- &bh);
+ status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk,
+ &bh);
if (status) {
mlog_errno(status);
goto out;
}
- status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh,
- olq_set_dquot, od);
+ status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -920,7 +952,7 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
* ol_quota_entries_per_block(sb);
}
- found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
+ found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0);
/* We failed? */
if (found == len) {
mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
@@ -949,15 +981,15 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
u64 p_blkno;
/* We are protected by dqio_sem so no locking needed */
- status = ocfs2_extend_no_holes(lqinode,
- lqinode->i_size + 2 * sb->s_blocksize,
- lqinode->i_size);
+ status = ocfs2_extend_no_holes(lqinode, NULL,
+ i_size_read(lqinode) + 2 * sb->s_blocksize,
+ i_size_read(lqinode));
if (status < 0) {
mlog_errno(status);
goto out;
}
status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
- lqinode->i_size + 2 * sb->s_blocksize);
+ i_size_read(lqinode) + 2 * sb->s_blocksize);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -980,10 +1012,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
}
/* Initialize chunk header */
- down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
&p_blkno, NULL, NULL);
- up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
if (status < 0) {
mlog_errno(status);
goto out_trans;
@@ -1008,17 +1038,11 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
OCFS2_QBLK_RESERVED_SPACE);
unlock_buffer(bh);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_trans;
- }
+ ocfs2_journal_dirty(handle, bh);
/* Initialize new block with structures */
- down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
&p_blkno, NULL, NULL);
- up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
if (status < 0) {
mlog_errno(status);
goto out_trans;
@@ -1039,11 +1063,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
lock_buffer(dbh);
memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
unlock_buffer(dbh);
- status = ocfs2_journal_dirty(handle, dbh);
- if (status < 0) {
- mlog_errno(status);
- goto out_trans;
- }
+ ocfs2_journal_dirty(handle, dbh);
/* Update local quotafile info */
oinfo->dqi_blocks += 2;
@@ -1104,25 +1124,23 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
return ocfs2_local_quota_add_chunk(sb, type, offset);
/* We are protected by dqio_sem so no locking needed */
- status = ocfs2_extend_no_holes(lqinode,
- lqinode->i_size + sb->s_blocksize,
- lqinode->i_size);
+ status = ocfs2_extend_no_holes(lqinode, NULL,
+ i_size_read(lqinode) + sb->s_blocksize,
+ i_size_read(lqinode));
if (status < 0) {
mlog_errno(status);
goto out;
}
status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
- lqinode->i_size + sb->s_blocksize);
+ i_size_read(lqinode) + sb->s_blocksize);
if (status < 0) {
mlog_errno(status);
goto out;
}
/* Get buffer from the just added block */
- down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
&p_blkno, NULL, NULL);
- up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -1154,11 +1172,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
lock_buffer(bh);
memset(bh->b_data, 0, sb->s_blocksize);
unlock_buffer(bh);
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_trans;
- }
+ ocfs2_journal_dirty(handle, bh);
+
/* Update chunk header */
status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
chunk->qc_headerbh,
@@ -1172,11 +1187,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
lock_buffer(chunk->qc_headerbh);
le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
unlock_buffer(chunk->qc_headerbh);
- status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
- if (status < 0) {
- mlog_errno(status);
- goto out_trans;
- }
+ ocfs2_journal_dirty(handle, chunk->qc_headerbh);
+
/* Update file header */
oinfo->dqi_blocks++;
status = ocfs2_local_write_info(sb, type);
@@ -1204,31 +1216,41 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
struct ocfs2_local_disk_chunk *dchunk;
dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
- ocfs2_set_bit(*offset, dchunk->dqc_bitmap);
+ ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, -1);
}
/* Create dquot in the local file for given id */
-static int ocfs2_create_local_dquot(struct dquot *dquot)
+int ocfs2_create_local_dquot(struct dquot *dquot)
{
struct super_block *sb = dquot->dq_sb;
- int type = dquot->dq_type;
+ int type = dquot->dq_id.type;
struct inode *lqinode = sb_dqopt(sb)->files[type];
struct ocfs2_quota_chunk *chunk;
struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
int offset;
int status;
+ u64 pcount;
+ down_write(&OCFS2_I(lqinode)->ip_alloc_sem);
chunk = ocfs2_find_free_entry(sb, type, &offset);
if (!chunk) {
chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
- if (IS_ERR(chunk))
- return PTR_ERR(chunk);
+ if (IS_ERR(chunk)) {
+ status = PTR_ERR(chunk);
+ goto out;
+ }
} else if (IS_ERR(chunk)) {
- return PTR_ERR(chunk);
+ status = PTR_ERR(chunk);
+ goto out;
}
od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
od->dq_chunk = chunk;
+ status = ocfs2_extent_map_get_blocks(lqinode,
+ ol_dqblk_block(sb, chunk->qc_num, offset),
+ &od->dq_local_phys_blk,
+ &pcount,
+ NULL);
/* Initialize dquot structure on disk */
status = ocfs2_local_write_dquot(dquot);
@@ -1245,55 +1267,22 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
goto out;
}
out:
+ up_write(&OCFS2_I(lqinode)->ip_alloc_sem);
return status;
}
-/* Create entry in local file for dquot, load data from the global file */
-static int ocfs2_local_read_dquot(struct dquot *dquot)
-{
- int status;
-
- mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
-
- status = ocfs2_global_read_dquot(dquot);
- if (status < 0) {
- mlog_errno(status);
- goto out_err;
- }
-
- /* Now create entry in the local quota file */
- status = ocfs2_create_local_dquot(dquot);
- if (status < 0) {
- mlog_errno(status);
- goto out_err;
- }
- mlog_exit(0);
- return 0;
-out_err:
- mlog_exit(status);
- return status;
-}
-
-/* Release dquot structure from local quota file. ocfs2_release_dquot() has
- * already started a transaction and obtained exclusive lock for global
- * quota file. */
-static int ocfs2_local_release_dquot(struct dquot *dquot)
+/*
+ * Release dquot structure from local quota file. ocfs2_release_dquot() has
+ * already started a transaction and written all changes to global quota file
+ */
+int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
{
int status;
- int type = dquot->dq_type;
+ int type = dquot->dq_id.type;
struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
struct super_block *sb = dquot->dq_sb;
struct ocfs2_local_disk_chunk *dchunk;
int offset;
- handle_t *handle = journal_current_handle();
-
- BUG_ON(!handle);
- /* First write all local changes to global file */
- status = ocfs2_global_release_dquot(dquot);
- if (status < 0) {
- mlog_errno(status);
- goto out;
- }
status = ocfs2_journal_access_dq(handle,
INODE_CACHE(sb_dqopt(sb)->files[type]),
@@ -1308,20 +1297,12 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
(od->dq_chunk->qc_headerbh->b_data);
/* Mark structure as freed */
lock_buffer(od->dq_chunk->qc_headerbh);
- ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
+ ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap);
le32_add_cpu(&dchunk->dqc_free, 1);
unlock_buffer(od->dq_chunk->qc_headerbh);
- status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
- if (status < 0) {
- mlog_errno(status);
- goto out;
- }
- status = 0;
+ ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
+
out:
- /* Clear the read bit so that next time someone uses this
- * dquot he reads fresh info from disk and allocates local
- * dquot structure */
- clear_bit(DQ_READ_B, &dquot->dq_flags);
return status;
}
@@ -1330,9 +1311,6 @@ static const struct quota_format_ops ocfs2_format_ops = {
.read_file_info = ocfs2_local_read_info,
.write_file_info = ocfs2_global_write_info,
.free_file_info = ocfs2_local_free_info,
- .read_dqblk = ocfs2_local_read_dquot,
- .commit_dqblk = ocfs2_local_write_dquot,
- .release_dqblk = ocfs2_local_release_dquot,
};
struct quota_format_type ocfs2_quota_format = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9e96921dffd..636aab69ead 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -16,7 +16,6 @@
*/
#include <linux/sort.h>
-#define MLOG_MASK_PREFIX ML_REFCOUNT
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "inode.h"
@@ -34,10 +33,10 @@
#include "aops.h"
#include "xattr.h"
#include "namei.h"
+#include "ocfs2_trace.h"
#include <linux/bio.h>
#include <linux/blkdev.h>
-#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
@@ -47,6 +46,7 @@
#include <linux/quotaops.h>
#include <linux/namei.h>
#include <linux/mount.h>
+#include <linux/posix_acl.h>
struct ocfs2_cow_context {
struct inode *inode;
@@ -66,7 +66,7 @@ struct ocfs2_cow_context {
u32 *num_clusters,
unsigned int *extent_flags);
int (*cow_duplicate_clusters)(handle_t *handle,
- struct ocfs2_cow_context *context,
+ struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
};
@@ -84,8 +84,7 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
struct ocfs2_refcount_block *rb =
(struct ocfs2_refcount_block *)bh->b_data;
- mlog(0, "Validating refcount block %llu\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
@@ -545,8 +544,8 @@ void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
while ((node = rb_last(root)) != NULL) {
tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
- mlog(0, "Purge tree %llu\n",
- (unsigned long long) tree->rf_blkno);
+ trace_ocfs2_purge_refcount_trees(
+ (unsigned long long) tree->rf_blkno);
rb_erase(&tree->rf_node, root);
ocfs2_free_refcount_tree(tree);
@@ -571,11 +570,12 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
u16 suballoc_bit_start;
u32 num_got;
- u64 first_blkno;
+ u64 suballoc_loc, first_blkno;
BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
- mlog(0, "create tree for inode %lu\n", inode->i_ino);
+ trace_ocfs2_create_refcount_tree(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
if (ret) {
@@ -597,7 +597,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
goto out_commit;
}
- ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
&suballoc_bit_start, &num_got,
&first_blkno);
if (ret) {
@@ -613,6 +613,11 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
}
new_bh = sb_getblk(inode->i_sb, first_blkno);
+ if (!new_bh) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out_commit;
+ }
ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
@@ -627,6 +632,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
memset(rb, 0, inode->i_sb->s_blocksize);
strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -645,8 +651,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
di->i_refcount_loc = cpu_to_le64(first_blkno);
spin_unlock(&oi->ip_lock);
- mlog(0, "created tree for inode %lu, refblock %llu\n",
- inode->i_ino, (unsigned long long)first_blkno);
+ trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
ocfs2_journal_dirty(handle, di_bh);
@@ -791,7 +796,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
if (le32_to_cpu(rb->rf_count) == 1) {
blk = le64_to_cpu(rb->rf_blkno);
bit = le16_to_cpu(rb->rf_suballoc_bit);
- bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+ if (rb->rf_suballoc_loc)
+ bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
+ else
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
alloc_inode = ocfs2_get_system_file_inode(osb,
EXTENT_ALLOC_SYSTEM_INODE,
@@ -1033,14 +1041,14 @@ static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
tmp_el = left_path->p_node[subtree_root].el;
blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
- for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) {
+ for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) {
if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
*cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
break;
}
}
- BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec));
+ BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec));
out:
ocfs2_free_path(left_path);
@@ -1252,8 +1260,9 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
goto out;
}
- mlog(0, "change index %d, old count %u, change %d\n", index,
- le32_to_cpu(rec->r_refcount), change);
+ trace_ocfs2_change_refcount_rec(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ index, le32_to_cpu(rec->r_refcount), change);
le32_add_cpu(&rec->r_refcount, change);
if (!rec->r_refcount) {
@@ -1269,9 +1278,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
} else if (merge)
ocfs2_refcount_rec_merge(rb, index);
- ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
out:
return ret;
}
@@ -1285,7 +1292,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
int ret;
u16 suballoc_bit_start;
u32 num_got;
- u64 blkno;
+ u64 suballoc_loc, blkno;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
struct buffer_head *new_bh = NULL;
struct ocfs2_refcount_block *new_rb;
@@ -1299,7 +1306,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
goto out;
}
- ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
&suballoc_bit_start, &num_got,
&blkno);
if (ret) {
@@ -1309,7 +1316,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
new_bh = sb_getblk(sb, blkno);
if (new_bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
@@ -1331,6 +1338,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
new_rb->rf_blkno = cpu_to_le64(blkno);
new_rb->rf_cpos = cpu_to_le32(0);
@@ -1350,8 +1358,8 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
ocfs2_journal_dirty(handle, ref_root_bh);
- mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno,
- le16_to_cpu(new_rb->rf_records.rl_used));
+ trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
+ le16_to_cpu(new_rb->rf_records.rl_used));
*ref_leaf_bh = new_bh;
new_bh = NULL;
@@ -1400,10 +1408,9 @@ static void swap_refcount_rec(void *a, void *b, int size)
{
struct ocfs2_refcount_rec *l = a, *r = b, tmp;
- tmp = *(struct ocfs2_refcount_rec *)l;
- *(struct ocfs2_refcount_rec *)l =
- *(struct ocfs2_refcount_rec *)r;
- *(struct ocfs2_refcount_rec *)r = tmp;
+ tmp = *l;
+ *l = *r;
+ *r = tmp;
}
/*
@@ -1463,9 +1470,9 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
(struct ocfs2_refcount_block *)new_bh->b_data;
struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
- mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n",
- (unsigned long long)ref_leaf_bh->b_blocknr,
- le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
+ trace_ocfs2_divide_leaf_refcount_block(
+ (unsigned long long)ref_leaf_bh->b_blocknr,
+ le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
/*
* XXX: Improvement later.
@@ -1525,7 +1532,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
int ret;
u16 suballoc_bit_start;
u32 num_got, new_cpos;
- u64 blkno;
+ u64 suballoc_loc, blkno;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
struct ocfs2_refcount_block *root_rb =
(struct ocfs2_refcount_block *)ref_root_bh->b_data;
@@ -1549,7 +1556,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
goto out;
}
- ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+ ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
&suballoc_bit_start, &num_got,
&blkno);
if (ret) {
@@ -1559,7 +1566,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
new_bh = sb_getblk(sb, blkno);
if (new_bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
@@ -1577,6 +1584,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
memset(new_rb, 0, sb->s_blocksize);
strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+ new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1597,8 +1605,8 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
- mlog(0, "insert new leaf block %llu at %u\n",
- (unsigned long long)new_bh->b_blocknr, new_cpos);
+ trace_ocfs2_new_leaf_refcount_block(
+ (unsigned long long)new_bh->b_blocknr, new_cpos);
/* Insert the new leaf block with the specific offset cpos. */
ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
@@ -1695,7 +1703,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle,
* 2 more credits, one for the leaf refcount block, one for
* the extent block contains the extent rec.
*/
- ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2);
+ ret = ocfs2_extend_trans(handle, 2);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1790,11 +1798,10 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
(le16_to_cpu(rf_list->rl_used) - index) *
sizeof(struct ocfs2_refcount_rec));
- mlog(0, "insert refcount record start %llu, len %u, count %u "
- "to leaf block %llu at index %d\n",
- (unsigned long long)le64_to_cpu(rec->r_cpos),
- le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount),
- (unsigned long long)ref_leaf_bh->b_blocknr, index);
+ trace_ocfs2_insert_refcount_rec(
+ (unsigned long long)ref_leaf_bh->b_blocknr, index,
+ (unsigned long long)le64_to_cpu(rec->r_cpos),
+ le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
rf_list->rl_recs[index] = *rec;
@@ -1803,11 +1810,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
if (merge)
ocfs2_refcount_rec_merge(rb, index);
- ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
if (index == 0) {
ret = ocfs2_adjust_refcount_rec(handle, ci,
@@ -1850,10 +1853,12 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
- mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n",
- le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters),
- le64_to_cpu(split_rec->r_cpos),
- le32_to_cpu(split_rec->r_clusters));
+ trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
+ le32_to_cpu(orig_rec->r_clusters),
+ le32_to_cpu(orig_rec->r_refcount),
+ le64_to_cpu(split_rec->r_cpos),
+ le32_to_cpu(split_rec->r_clusters),
+ le32_to_cpu(split_rec->r_refcount));
/*
* If we just need to split the header or tail clusters,
@@ -1967,20 +1972,17 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
if (split_rec->r_refcount) {
rf_list->rl_recs[index] = *split_rec;
- mlog(0, "insert refcount record start %llu, len %u, count %u "
- "to leaf block %llu at index %d\n",
- (unsigned long long)le64_to_cpu(split_rec->r_cpos),
- le32_to_cpu(split_rec->r_clusters),
- le32_to_cpu(split_rec->r_refcount),
- (unsigned long long)ref_leaf_bh->b_blocknr, index);
+ trace_ocfs2_split_refcount_rec_insert(
+ (unsigned long long)ref_leaf_bh->b_blocknr, index,
+ (unsigned long long)le64_to_cpu(split_rec->r_cpos),
+ le32_to_cpu(split_rec->r_clusters),
+ le32_to_cpu(split_rec->r_refcount));
if (merge)
ocfs2_refcount_rec_merge(rb, index);
}
- ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
- if (ret)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, ref_leaf_bh);
out:
brelse(new_bh);
@@ -1999,7 +2001,7 @@ static int __ocfs2_increase_refcount(handle_t *handle,
struct ocfs2_refcount_rec rec;
unsigned int set_len = 0;
- mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n",
+ trace_ocfs2_increase_refcount_begin(
(unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)cpos, len);
@@ -2026,9 +2028,9 @@ static int __ocfs2_increase_refcount(handle_t *handle,
*/
if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
set_len <= len) {
- mlog(0, "increase refcount rec, start %llu, len %u, "
- "count %u\n", (unsigned long long)cpos, set_len,
- le32_to_cpu(rec.r_refcount));
+ trace_ocfs2_increase_refcount_change(
+ (unsigned long long)cpos, set_len,
+ le32_to_cpu(rec.r_refcount));
ret = ocfs2_change_refcount_rec(handle, ci,
ref_leaf_bh, index,
merge, 1);
@@ -2039,7 +2041,7 @@ static int __ocfs2_increase_refcount(handle_t *handle,
} else if (!rec.r_refcount) {
rec.r_refcount = cpu_to_le32(1);
- mlog(0, "insert refcount rec, start %llu, len %u\n",
+ trace_ocfs2_increase_refcount_insert(
(unsigned long long)le64_to_cpu(rec.r_cpos),
set_len);
ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
@@ -2057,8 +2059,7 @@ static int __ocfs2_increase_refcount(handle_t *handle,
rec.r_clusters = cpu_to_le32(set_len);
le32_add_cpu(&rec.r_refcount, 1);
- mlog(0, "split refcount rec, start %llu, "
- "len %u, count %u\n",
+ trace_ocfs2_increase_refcount_split(
(unsigned long long)le64_to_cpu(rec.r_cpos),
set_len, le32_to_cpu(rec.r_refcount));
ret = ocfs2_split_refcount_rec(handle, ci,
@@ -2097,6 +2098,11 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
BUG_ON(rb->rf_records.rl_used);
+ trace_ocfs2_remove_refcount_extent(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)ref_leaf_bh->b_blocknr,
+ le32_to_cpu(rb->rf_cpos));
+
ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
1, meta_ac, dealloc);
@@ -2113,6 +2119,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
*/
ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
le16_to_cpu(rb->rf_suballoc_slot),
+ le64_to_cpu(rb->rf_suballoc_loc),
le64_to_cpu(rb->rf_blkno),
le16_to_cpu(rb->rf_suballoc_bit));
if (ret) {
@@ -2138,7 +2145,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
if (!rb->rf_list.l_next_free_rec) {
BUG_ON(rb->rf_clusters);
- mlog(0, "reset refcount tree root %llu to be a record block.\n",
+ trace_ocfs2_restore_refcount_block(
(unsigned long long)ref_root_bh->b_blocknr);
rb->rf_flags = 0;
@@ -2185,6 +2192,10 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle,
BUG_ON(cpos + len >
le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
+ trace_ocfs2_decrease_refcount_rec(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)cpos, len);
+
if (cpos == le64_to_cpu(rec->r_cpos) &&
len == le32_to_cpu(rec->r_clusters))
ret = ocfs2_change_refcount_rec(handle, ci,
@@ -2196,12 +2207,6 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle,
le32_add_cpu(&split.r_refcount, -1);
- mlog(0, "split refcount rec, start %llu, "
- "len %u, count %u, original start %llu, len %u\n",
- (unsigned long long)le64_to_cpu(split.r_cpos),
- len, le32_to_cpu(split.r_refcount),
- (unsigned long long)le64_to_cpu(rec->r_cpos),
- le32_to_cpu(rec->r_clusters));
ret = ocfs2_split_refcount_rec(handle, ci,
ref_root_bh, ref_leaf_bh,
&split, index, 1,
@@ -2240,10 +2245,9 @@ static int __ocfs2_decrease_refcount(handle_t *handle,
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
struct buffer_head *ref_leaf_bh = NULL;
- mlog(0, "Tree owner %llu, decrease refcount start %llu, "
- "len %u, delete %u\n",
- (unsigned long long)ocfs2_metadata_cache_owner(ci),
- (unsigned long long)cpos, len, delete);
+ trace_ocfs2_decrease_refcount(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)cpos, len, delete);
while (len) {
ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
@@ -2353,8 +2357,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
{
int ret;
- mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n",
- inode->i_ino, cpos, len, phys);
+ trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
+ cpos, len, phys);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
@@ -2393,8 +2397,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
u32 len;
- mlog(0, "start_cpos %llu, clusters %u\n",
- (unsigned long long)start_cpos, clusters);
while (clusters) {
ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
cpos, clusters, &rec,
@@ -2413,7 +2415,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
rb = (struct ocfs2_refcount_block *)
prev_bh->b_data;
- if (le64_to_cpu(rb->rf_records.rl_used) +
+ if (le16_to_cpu(rb->rf_records.rl_used) +
recs_add >
le16_to_cpu(rb->rf_records.rl_count))
ref_blocks++;
@@ -2428,26 +2430,35 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
- mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu,"
- "rec->r_clusters %u, rec->r_refcount %u, index %d\n",
- recs_add, (unsigned long long)cpos, clusters,
- (unsigned long long)le64_to_cpu(rec.r_cpos),
- le32_to_cpu(rec.r_clusters),
- le32_to_cpu(rec.r_refcount), index);
+ trace_ocfs2_calc_refcount_meta_credits_iterate(
+ recs_add, (unsigned long long)cpos, clusters,
+ (unsigned long long)le64_to_cpu(rec.r_cpos),
+ le32_to_cpu(rec.r_clusters),
+ le32_to_cpu(rec.r_refcount), index);
len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
le32_to_cpu(rec.r_clusters)) - cpos;
/*
- * If the refcount rec already exist, cool. We just need
- * to check whether there is a split. Otherwise we just need
- * to increase the refcount.
- * If we will insert one, increases recs_add.
- *
* We record all the records which will be inserted to the
* same refcount block, so that we can tell exactly whether
* we need a new refcount block or not.
+ *
+ * If we will insert a new one, this is easy and only happens
+ * during adding refcounted flag to the extent, so we don't
+ * have a chance of spliting. We just need one record.
+ *
+ * If the refcount rec already exists, that would be a little
+ * complicated. we may have to:
+ * 1) split at the beginning if the start pos isn't aligned.
+ * we need 1 more record in this case.
+ * 2) split int the end if the end pos isn't aligned.
+ * we need 1 more record in this case.
+ * 3) split in the middle because of file system fragmentation.
+ * we need 2 more records in this case(we can't detect this
+ * beforehand, so always think of the worst case).
*/
if (rec.r_refcount) {
+ recs_add += 2;
/* Check whether we need a split at the beginning. */
if (cpos == start_cpos &&
cpos != le64_to_cpu(rec.r_cpos))
@@ -2469,7 +2480,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
if (prev_bh) {
rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
- if (le64_to_cpu(rb->rf_records.rl_used) + recs_add >
+ if (le16_to_cpu(rb->rf_records.rl_used) + recs_add >
le16_to_cpu(rb->rf_records.rl_count))
ref_blocks++;
@@ -2479,7 +2490,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
if (!ref_blocks)
goto out;
- mlog(0, "we need ref_blocks %d\n", ref_blocks);
*meta_add += ref_blocks;
*credits += ref_blocks;
@@ -2497,14 +2507,17 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
*meta_add += ocfs2_extend_meta_needed(et.et_root_el);
*credits += ocfs2_calc_extend_credits(sb,
- et.et_root_el,
- ref_blocks);
+ et.et_root_el);
} else {
*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
*meta_add += 1;
}
out:
+
+ trace_ocfs2_calc_refcount_meta_credits(
+ (unsigned long long)start_cpos, clusters,
+ *meta_add, *credits);
brelse(ref_leaf_bh);
brelse(prev_bh);
return ret;
@@ -2517,20 +2530,19 @@ out:
*
* Normally the refcount blocks store these refcount should be
* contiguous also, so that we can get the number easily.
- * As for meta_ac, we will at most add split 2 refcount record and
- * 2 more refcount block, so just check it in a rough way.
+ * We will at most add split 2 refcount records and 2 more
+ * refcount blocks, so just check it in a rough way.
*
* Caller must hold refcount tree lock.
*/
int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
- struct buffer_head *di_bh,
+ u64 refcount_loc,
u64 phys_blkno,
u32 clusters,
int *credits,
- struct ocfs2_alloc_context **meta_ac)
+ int *ref_blocks)
{
- int ret, ref_blocks = 0;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ int ret;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *tree;
@@ -2547,14 +2559,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
- le64_to_cpu(di->i_refcount_loc), &tree);
+ refcount_loc, &tree);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_read_refcount_block(&tree->rf_ci,
- le64_to_cpu(di->i_refcount_loc),
+ ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
&ref_root_bh);
if (ret) {
mlog_errno(ret);
@@ -2565,21 +2576,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
&tree->rf_ci,
ref_root_bh,
start_cpos, clusters,
- &ref_blocks, credits);
+ ref_blocks, credits);
if (ret) {
mlog_errno(ret);
goto out;
}
- mlog(0, "reserve new metadata %d, credits = %d\n",
- ref_blocks, *credits);
-
- if (ref_blocks) {
- ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
- ref_blocks, meta_ac);
- if (ret)
- mlog_errno(ret);
- }
+ trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
out:
brelse(ref_root_bh);
@@ -2875,8 +2878,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
meta_add =
ocfs2_extend_meta_needed(et->et_root_el);
- *credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
- num_clusters + 2);
+ *credits += ocfs2_calc_extend_credits(sb, et->et_root_el);
ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
p_cluster, num_clusters,
@@ -2886,8 +2888,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
goto out;
}
- mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n",
- meta_add, num_clusters, *credits);
+ trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
meta_ac);
if (ret) {
@@ -2922,26 +2923,34 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
- struct ocfs2_cow_context *context,
- u32 cpos, u32 old_cluster,
- u32 new_cluster, u32 new_len)
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len)
{
int ret = 0, partial;
- struct ocfs2_caching_info *ci = context->data_et.et_ci;
- struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+ struct super_block *sb = inode->i_sb;
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
struct page *page;
pgoff_t page_index;
- unsigned int from, to;
+ unsigned int from, to, readahead_pages;
loff_t offset, end, map_end;
- struct address_space *mapping = context->inode->i_mapping;
+ struct address_space *mapping = inode->i_mapping;
- mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
- new_cluster, new_len, cpos);
+ trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
+ new_cluster, new_len);
+ readahead_pages =
+ (ocfs2_cow_contig_clusters(sb) <<
+ OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
+ /*
+ * We only duplicate pages until we reach the page contains i_size - 1.
+ * So trim 'end' to i_size.
+ */
+ if (end > i_size_read(inode))
+ end = i_size_read(inode);
while (offset < end) {
page_index = offset >> PAGE_CACHE_SHIFT;
@@ -2955,7 +2964,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
if (map_end & (PAGE_CACHE_SIZE - 1))
to = map_end & (PAGE_CACHE_SIZE - 1);
- page = grab_cache_page(mapping, page_index);
+ page = find_or_create_page(mapping, page_index, GFP_NOFS);
+ if (!page) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ break;
+ }
/*
* In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
@@ -2983,7 +2997,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
}
}
- ocfs2_map_and_dirty_page(context->inode,
+ ocfs2_map_and_dirty_page(inode,
handle, from, to,
page, 0, &new_block);
mark_page_accessed(page);
@@ -2999,14 +3013,14 @@ unlock:
return ret;
}
-static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
- struct ocfs2_cow_context *context,
- u32 cpos, u32 old_cluster,
- u32 new_cluster, u32 new_len)
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len)
{
int ret = 0;
- struct super_block *sb = context->inode->i_sb;
- struct ocfs2_caching_info *ci = context->data_et.et_ci;
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_caching_info *ci = INODE_CACHE(inode);
int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
@@ -3014,13 +3028,13 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
struct buffer_head *old_bh = NULL;
struct buffer_head *new_bh = NULL;
- mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster,
- new_cluster, new_len);
+ trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
+ new_cluster, new_len);
for (i = 0; i < blocks; i++, old_block++, new_block++) {
new_bh = sb_getblk(osb->sb, new_block);
if (new_bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
mlog_errno(ret);
break;
}
@@ -3041,11 +3055,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
}
memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
- ret = ocfs2_journal_dirty(handle, new_bh);
- if (ret) {
- mlog_errno(ret);
- break;
- }
+ ocfs2_journal_dirty(handle, new_bh);
brelse(new_bh);
brelse(old_bh);
@@ -3072,8 +3082,8 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
- mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
- (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
+ trace_ocfs2_clear_ext_refcount((unsigned long long)ino,
+ cpos, len, p_cluster, ext_flags);
memset(&replace_rec, 0, sizeof(replace_rec));
replace_rec.e_cpos = cpu_to_le32(cpos);
@@ -3128,13 +3138,13 @@ static int ocfs2_replace_clusters(handle_t *handle,
struct ocfs2_caching_info *ci = context->data_et.et_ci;
u64 ino = ocfs2_metadata_cache_owner(ci);
- mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
- (unsigned long long)ino, cpos, old, new, len, ext_flags);
+ trace_ocfs2_replace_clusters((unsigned long long)ino,
+ cpos, old, new, len, ext_flags);
/*If the old clusters is unwritten, no need to duplicate. */
if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = context->cow_duplicate_clusters(handle, context, cpos,
- old, new, len);
+ ret = context->cow_duplicate_clusters(handle, context->inode,
+ cpos, old, new, len);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3150,22 +3160,22 @@ out:
return ret;
}
-static int ocfs2_cow_sync_writeback(struct super_block *sb,
- struct ocfs2_cow_context *context,
- u32 cpos, u32 num_clusters)
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+ struct inode *inode,
+ u32 cpos, u32 num_clusters)
{
int ret = 0;
loff_t offset, end, map_end;
pgoff_t page_index;
struct page *page;
- if (ocfs2_should_order_data(context->inode))
+ if (ocfs2_should_order_data(inode))
return 0;
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
- ret = filemap_fdatawrite_range(context->inode->i_mapping,
+ ret = filemap_fdatawrite_range(inode->i_mapping,
offset, end - 1);
if (ret < 0) {
mlog_errno(ret);
@@ -3178,7 +3188,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
if (map_end > end)
map_end = end;
- page = grab_cache_page(context->inode->i_mapping, page_index);
+ page = find_or_create_page(inode->i_mapping,
+ page_index, GFP_NOFS);
BUG_ON(!page);
wait_on_page_writeback(page);
@@ -3214,7 +3225,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
u32 num_clusters, unsigned int e_flags)
{
int ret, delete, index, credits = 0;
- u32 new_bit, new_len;
+ u32 new_bit, new_len, orig_num_clusters;
unsigned int set_len;
struct ocfs2_super *osb = OCFS2_SB(sb);
handle_t *handle;
@@ -3222,8 +3233,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
struct ocfs2_refcount_rec rec;
- mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n",
- cpos, p_cluster, num_clusters, e_flags);
+ trace_ocfs2_make_clusters_writable(cpos, p_cluster,
+ num_clusters, e_flags);
ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
&context->data_et,
@@ -3247,6 +3258,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
goto out;
}
+ orig_num_clusters = num_clusters;
+
while (num_clusters) {
ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
p_cluster, num_clusters,
@@ -3283,7 +3296,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
} else {
delete = 1;
- ret = __ocfs2_claim_clusters(osb, handle,
+ ret = __ocfs2_claim_clusters(handle,
context->data_ac,
1, set_len,
&new_bit, &new_len);
@@ -3334,7 +3347,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
* in write-back mode.
*/
if (context->get_clusters == ocfs2_di_get_clusters) {
- ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
+ ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
+ orig_num_clusters);
if (ret)
mlog_errno(ret);
}
@@ -3435,9 +3449,9 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
goto out;
}
- mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
- "cow_len %u\n", inode->i_ino,
- cpos, write_len, cow_start, cow_len);
+ trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
+ cpos, write_len, max_cpos,
+ cow_start, cow_len);
BUG_ON(cow_len == 0);
@@ -3586,7 +3600,7 @@ int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
* one will split a refcount rec, so totally we need
* clusters * 2 new refcount rec.
*/
- if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
+ if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
le16_to_cpu(rb->rf_records.rl_count))
ref_blocks++;
@@ -3614,8 +3628,7 @@ int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
*credits += ocfs2_calc_extend_credits(inode->i_sb,
- et.et_root_el,
- ref_blocks);
+ et.et_root_el);
}
out:
@@ -3663,7 +3676,7 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
context->cow_start = cow_start;
context->cow_len = cow_len;
context->ref_tree = ref_tree;
- context->ref_root_bh = ref_root_bh;;
+ context->ref_root_bh = ref_root_bh;
context->cow_object = xv;
context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
@@ -3712,8 +3725,7 @@ int ocfs2_add_refcount_flag(struct inode *inode,
goto out;
}
- mlog(0, "reserve new metadata %d, credits = %d\n",
- ref_blocks, credits);
+ trace_ocfs2_add_refcount_flag(ref_blocks, credits);
if (ref_blocks) {
ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
@@ -3844,7 +3856,10 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
while (cpos < clusters) {
ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
&num_clusters, &ext_flags);
-
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
ret = ocfs2_add_refcount_flag(inode, &di_et,
&ref_tree->rf_ci,
@@ -4015,7 +4030,10 @@ static int ocfs2_duplicate_extent_list(struct inode *s_inode,
while (cpos < clusters) {
ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
&num_clusters, &ext_flags);
-
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
if (p_cluster) {
ret = ocfs2_add_refcounted_extent(t_inode, &et,
ref_ci, ref_root_bh,
@@ -4075,6 +4093,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
spin_unlock(&OCFS2_I(t_inode)->ip_lock);
i_size_write(t_inode, size);
+ t_inode->i_blocks = s_inode->i_blocks;
di->i_xattr_inline_size = s_di->i_xattr_inline_size;
di->i_clusters = s_di->i_clusters;
@@ -4083,6 +4102,9 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
di->i_attr = s_di->i_attr;
if (preserve) {
+ t_inode->i_uid = s_inode->i_uid;
+ t_inode->i_gid = s_inode->i_gid;
+ t_inode->i_mode = s_inode->i_mode;
di->i_uid = s_di->i_uid;
di->i_gid = s_di->i_gid;
di->i_mode = s_di->i_mode;
@@ -4177,6 +4199,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
struct inode *inode = old_dentry->d_inode;
struct buffer_head *new_bh = NULL;
+ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = filemap_fdatawrite(inode->i_mapping);
if (ret) {
mlog_errno(ret);
@@ -4189,8 +4217,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
goto out;
}
- mutex_lock(&new_inode->i_mutex);
- ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
+ mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
+ ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
+ OI_LS_REFLINK_TARGET);
if (ret) {
mlog_errno(ret);
goto out_unlock;
@@ -4239,20 +4268,36 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
struct inode *inode = old_dentry->d_inode;
struct buffer_head *old_bh = NULL;
struct inode *new_orphan_inode = NULL;
+ struct posix_acl *default_acl, *acl;
+ umode_t mode;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
- error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+ mode = inode->i_mode;
+ error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ error = ocfs2_create_inode_in_orphan(dir, mode,
&new_orphan_inode);
if (error) {
mlog_errno(error);
goto out;
}
+ error = ocfs2_rw_lock(inode, 1);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
error = ocfs2_inode_lock(inode, &old_bh, 1);
if (error) {
mlog_errno(error);
+ ocfs2_rw_unlock(inode, 1);
goto out;
}
@@ -4264,6 +4309,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
up_write(&OCFS2_I(inode)->ip_xattr_sem);
ocfs2_inode_unlock(inode, 1);
+ ocfs2_rw_unlock(inode, 1);
brelse(old_bh);
if (error) {
@@ -4273,11 +4319,17 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
- error = ocfs2_init_security_and_acl(dir, new_orphan_inode);
+ error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
+ &new_dentry->d_name,
+ default_acl, acl);
if (error)
mlog_errno(error);
}
out:
+ if (default_acl)
+ posix_acl_release(default_acl);
+ if (acl)
+ posix_acl_release(acl);
if (!error) {
error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
new_dentry);
@@ -4314,25 +4366,6 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}
-/* copied from user_path_parent. */
-static int ocfs2_user_path_parent(const char __user *path,
- struct nameidata *nd, char **name)
-{
- char *s = getname(path);
- int error;
-
- if (IS_ERR(s))
- return PTR_ERR(s);
-
- error = path_lookup(s, LOOKUP_PARENT, nd);
- if (error)
- putname(s);
- else
- *name = s;
-
- return error;
-}
-
/**
* ocfs2_vfs_reflink - Create a reference-counted link
*
@@ -4372,7 +4405,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
* rights to do so.
*/
if (preserve) {
- if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN))
+ if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN))
return -EPERM;
if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
return -EPERM;
@@ -4406,10 +4439,8 @@ int ocfs2_reflink_ioctl(struct inode *inode,
bool preserve)
{
struct dentry *new_dentry;
- struct nameidata nd;
- struct path old_path;
+ struct path old_path, new_path;
int error;
- char *to = NULL;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
@@ -4420,39 +4451,24 @@ int ocfs2_reflink_ioctl(struct inode *inode,
return error;
}
- error = ocfs2_user_path_parent(newname, &nd, &to);
- if (error) {
- mlog_errno(error);
- goto out;
- }
-
- error = -EXDEV;
- if (old_path.mnt != nd.path.mnt)
- goto out_release;
- new_dentry = lookup_create(&nd, 0);
+ new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry)) {
mlog_errno(error);
- goto out_unlock;
+ goto out;
}
- error = mnt_want_write(nd.path.mnt);
- if (error) {
+ error = -EXDEV;
+ if (old_path.mnt != new_path.mnt) {
mlog_errno(error);
goto out_dput;
}
error = ocfs2_vfs_reflink(old_path.dentry,
- nd.path.dentry->d_inode,
+ new_path.dentry->d_inode,
new_dentry, preserve);
- mnt_drop_write(nd.path.mnt);
out_dput:
- dput(new_dentry);
-out_unlock:
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-out_release:
- path_put(&nd.path);
- putname(to);
+ done_path_create(&new_path, new_dentry);
out:
path_put(&old_path);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c1d19b1d3ec..6422bbcdb52 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
struct rb_node rf_node;
u64 rf_blkno;
u32 rf_generation;
+ struct kref rf_getcnt;
struct rw_semaphore rf_sem;
struct ocfs2_lock_res rf_lockres;
- struct kref rf_getcnt;
int rf_removed;
/* the following 4 fields are used by caching_info. */
- struct ocfs2_caching_info rf_ci;
spinlock_t rf_lock;
+ struct ocfs2_caching_info rf_ci;
struct mutex rf_io_mutex;
struct super_block *rf_sb;
};
@@ -47,12 +47,13 @@ int ocfs2_decrease_refcount(struct inode *inode,
struct ocfs2_cached_dealloc_ctxt *dealloc,
int delete);
int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
- struct buffer_head *di_bh,
+ u64 refcount_loc,
u64 phys_blkno,
u32 clusters,
int *credits,
- struct ocfs2_alloc_context **meta_ac);
-int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
+ int *ref_blocks);
+int ocfs2_refcount_cow(struct inode *inode,
+ struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos);
typedef int (ocfs2_post_refcount_func)(struct inode *inode,
@@ -83,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
struct buffer_head *ref_root_bh,
u32 cpos, u32 write_len,
struct ocfs2_post_refcount *post);
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len);
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+ struct inode *inode,
+ u32 cpos, u32 old_cluster,
+ u32 new_cluster, u32 new_len);
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+ struct inode *inode,
+ u32 cpos, u32 num_clusters);
int ocfs2_add_refcount_flag(struct inode *inode,
struct ocfs2_extent_tree *data_et,
struct ocfs2_caching_info *ref_ci,
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644
index 00000000000..41ffd36c689
--- /dev/null
+++ b/fs/ocfs2/reservations.c
@@ -0,0 +1,839 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.c
+ *
+ * Allocation reservations implementation
+ *
+ * Some code borrowed from fs/ext3/balloc.c and is:
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * The rest is copyright (C) 2010 Novell. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/highmem.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_trace.h"
+
+#ifdef CONFIG_OCFS2_DEBUG_FS
+#define OCFS2_CHECK_RESERVATIONS
+#endif
+
+DEFINE_SPINLOCK(resv_lock);
+
+#define OCFS2_MIN_RESV_WINDOW_BITS 8
+#define OCFS2_MAX_RESV_WINDOW_BITS 1024
+
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
+{
+ return (osb->osb_resv_level && osb->osb_dir_resv_level);
+}
+
+static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ struct ocfs2_super *osb = resmap->m_osb;
+ unsigned int bits;
+
+ if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
+ /* 8, 16, 32, 64, 128, 256, 512, 1024 */
+ bits = 4 << osb->osb_resv_level;
+ } else {
+ bits = 4 << osb->osb_dir_resv_level;
+ }
+ return bits;
+}
+
+static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
+{
+ if (resv->r_len)
+ return resv->r_start + resv->r_len - 1;
+ return resv->r_start;
+}
+
+static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
+{
+ return !!(resv->r_len == 0);
+}
+
+static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
+{
+ if (resmap->m_osb->osb_resv_level == 0)
+ return 1;
+ return 0;
+}
+
+static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
+{
+ struct ocfs2_super *osb = resmap->m_osb;
+ struct rb_node *node;
+ struct ocfs2_alloc_reservation *resv;
+ int i = 0;
+
+ mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
+ osb->dev_str, resmap->m_bitmap_len);
+
+ node = rb_first(&resmap->m_reservations);
+ while (node) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
+ "\tlast_len: %u\n", resv->r_start,
+ ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+ resv->r_last_len);
+
+ node = rb_next(node);
+ i++;
+ }
+
+ mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
+
+ i = 0;
+ list_for_each_entry(resv, &resmap->m_lru, r_lru) {
+ mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
+ "last_start: %u\tlast_len: %u\n", i, resv->r_start,
+ ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
+ resv->r_last_len);
+
+ i++;
+ }
+}
+
+#ifdef OCFS2_CHECK_RESERVATIONS
+static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
+ int i,
+ struct ocfs2_alloc_reservation *resv)
+{
+ char *disk_bitmap = resmap->m_disk_bitmap;
+ unsigned int start = resv->r_start;
+ unsigned int end = ocfs2_resv_end(resv);
+
+ while (start <= end) {
+ if (ocfs2_test_bit(start, disk_bitmap)) {
+ mlog(ML_ERROR,
+ "reservation %d covers an allocated area "
+ "starting at bit %u!\n", i, start);
+ return 1;
+ }
+
+ start++;
+ }
+ return 0;
+}
+
+static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+ unsigned int off = 0;
+ int i = 0;
+ struct rb_node *node;
+ struct ocfs2_alloc_reservation *resv;
+
+ node = rb_first(&resmap->m_reservations);
+ while (node) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ if (i > 0 && resv->r_start <= off) {
+ mlog(ML_ERROR, "reservation %d has bad start off!\n",
+ i);
+ goto bad;
+ }
+
+ if (resv->r_len == 0) {
+ mlog(ML_ERROR, "reservation %d has no length!\n",
+ i);
+ goto bad;
+ }
+
+ if (resv->r_start > ocfs2_resv_end(resv)) {
+ mlog(ML_ERROR, "reservation %d has invalid range!\n",
+ i);
+ goto bad;
+ }
+
+ if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
+ mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
+ i);
+ goto bad;
+ }
+
+ if (ocfs2_validate_resmap_bits(resmap, i, resv))
+ goto bad;
+
+ off = ocfs2_resv_end(resv);
+ node = rb_next(node);
+
+ i++;
+ }
+ return;
+
+bad:
+ ocfs2_dump_resv(resmap);
+ BUG();
+}
+#else
+static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
+{
+
+}
+#endif
+
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
+{
+ memset(resv, 0, sizeof(*resv));
+ INIT_LIST_HEAD(&resv->r_lru);
+}
+
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+ unsigned int flags)
+{
+ BUG_ON(flags & ~OCFS2_RESV_TYPES);
+
+ resv->r_flags |= flags;
+}
+
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+ struct ocfs2_reservation_map *resmap)
+{
+ memset(resmap, 0, sizeof(*resmap));
+
+ resmap->m_osb = osb;
+ resmap->m_reservations = RB_ROOT;
+ /* m_bitmap_len is initialized to zero by the above memset. */
+ INIT_LIST_HEAD(&resmap->m_lru);
+
+ return 0;
+}
+
+static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ assert_spin_locked(&resv_lock);
+
+ if (!list_empty(&resv->r_lru))
+ list_del_init(&resv->r_lru);
+
+ list_add_tail(&resv->r_lru, &resmap->m_lru);
+}
+
+static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
+{
+ resv->r_len = 0;
+ resv->r_start = 0;
+}
+
+static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
+ list_del_init(&resv->r_lru);
+ rb_erase(&resv->r_node, &resmap->m_reservations);
+ resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
+ }
+}
+
+static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ assert_spin_locked(&resv_lock);
+
+ __ocfs2_resv_trunc(resv);
+ /*
+ * last_len and last_start no longer make sense if
+ * we're changing the range of our allocations.
+ */
+ resv->r_last_len = resv->r_last_start = 0;
+
+ ocfs2_resv_remove(resmap, resv);
+}
+
+/* does nothing if 'resv' is null */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv)
+{
+ if (resv) {
+ spin_lock(&resv_lock);
+ __ocfs2_resv_discard(resmap, resv);
+ spin_unlock(&resv_lock);
+ }
+}
+
+static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
+{
+ struct rb_node *node;
+ struct ocfs2_alloc_reservation *resv;
+
+ assert_spin_locked(&resv_lock);
+
+ while ((node = rb_last(&resmap->m_reservations)) != NULL) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ __ocfs2_resv_discard(resmap, resv);
+ }
+}
+
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+ unsigned int clen, char *disk_bitmap)
+{
+ if (ocfs2_resmap_disabled(resmap))
+ return;
+
+ spin_lock(&resv_lock);
+
+ ocfs2_resmap_clear_all_resv(resmap);
+ resmap->m_bitmap_len = clen;
+ resmap->m_disk_bitmap = disk_bitmap;
+
+ spin_unlock(&resv_lock);
+}
+
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
+{
+ /* Does nothing for now. Keep this around for API symmetry */
+}
+
+static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *new)
+{
+ struct rb_root *root = &resmap->m_reservations;
+ struct rb_node *parent = NULL;
+ struct rb_node **p = &root->rb_node;
+ struct ocfs2_alloc_reservation *tmp;
+
+ assert_spin_locked(&resv_lock);
+
+ trace_ocfs2_resv_insert(new->r_start, new->r_len);
+
+ while (*p) {
+ parent = *p;
+
+ tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
+
+ if (new->r_start < tmp->r_start) {
+ p = &(*p)->rb_left;
+
+ /*
+ * This is a good place to check for
+ * overlapping reservations.
+ */
+ BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
+ } else if (new->r_start > ocfs2_resv_end(tmp)) {
+ p = &(*p)->rb_right;
+ } else {
+ /* This should never happen! */
+ mlog(ML_ERROR, "Duplicate reservation window!\n");
+ BUG();
+ }
+ }
+
+ rb_link_node(&new->r_node, parent, p);
+ rb_insert_color(&new->r_node, root);
+ new->r_flags |= OCFS2_RESV_FLAG_INUSE;
+
+ ocfs2_resv_mark_lru(resmap, new);
+
+ ocfs2_check_resmap(resmap);
+}
+
+/**
+ * ocfs2_find_resv_lhs() - find the window which contains goal
+ * @resmap: reservation map to search
+ * @goal: which bit to search for
+ *
+ * If a window containing that goal is not found, we return the window
+ * which comes before goal. Returns NULL on empty rbtree or no window
+ * before goal.
+ */
+static struct ocfs2_alloc_reservation *
+ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
+{
+ struct ocfs2_alloc_reservation *resv = NULL;
+ struct ocfs2_alloc_reservation *prev_resv = NULL;
+ struct rb_node *node = resmap->m_reservations.rb_node;
+
+ assert_spin_locked(&resv_lock);
+
+ if (!node)
+ return NULL;
+
+ node = rb_first(&resmap->m_reservations);
+ while (node) {
+ resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
+
+ if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
+ break;
+
+ /* Check if we overshot the reservation just before goal? */
+ if (resv->r_start > goal) {
+ resv = prev_resv;
+ break;
+ }
+
+ prev_resv = resv;
+ node = rb_next(node);
+ }
+
+ return resv;
+}
+
+/*
+ * We are given a range within the bitmap, which corresponds to a gap
+ * inside the reservations tree (search_start, search_len). The range
+ * can be anything from the whole bitmap, to a gap between
+ * reservations.
+ *
+ * The start value of *rstart is insignificant.
+ *
+ * This function searches the bitmap range starting at search_start
+ * with length search_len for a set of contiguous free bits. We try
+ * to find up to 'wanted' bits, but can sometimes return less.
+ *
+ * Returns the length of allocation, 0 if no free bits are found.
+ *
+ * *cstart and *clen will also be populated with the result.
+ */
+static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
+ unsigned int wanted,
+ unsigned int search_start,
+ unsigned int search_len,
+ unsigned int *rstart,
+ unsigned int *rlen)
+{
+ void *bitmap = resmap->m_disk_bitmap;
+ unsigned int best_start, best_len = 0;
+ int offset, start, found;
+
+ trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len,
+ wanted, resmap->m_bitmap_len);
+
+ found = best_start = best_len = 0;
+
+ start = search_start;
+ while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
+ start)) != -1) {
+ /* Search reached end of the region */
+ if (offset >= (search_start + search_len))
+ break;
+
+ if (offset == start) {
+ /* we found a zero */
+ found++;
+ /* move start to the next bit to test */
+ start++;
+ } else {
+ /* got a zero after some ones */
+ found = 1;
+ start = offset + 1;
+ }
+ if (found > best_len) {
+ best_len = found;
+ best_start = start - found;
+ }
+
+ if (found >= wanted)
+ break;
+ }
+
+ if (best_len == 0)
+ return 0;
+
+ if (best_len >= wanted)
+ best_len = wanted;
+
+ *rlen = best_len;
+ *rstart = best_start;
+
+ trace_ocfs2_resmap_find_free_bits_end(best_start, best_len);
+
+ return *rlen;
+}
+
+static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int goal, unsigned int wanted)
+{
+ struct rb_root *root = &resmap->m_reservations;
+ unsigned int gap_start, gap_end, gap_len;
+ struct ocfs2_alloc_reservation *prev_resv, *next_resv;
+ struct rb_node *prev, *next;
+ unsigned int cstart, clen;
+ unsigned int best_start = 0, best_len = 0;
+
+ /*
+ * Nasty cases to consider:
+ *
+ * - rbtree is empty
+ * - our window should be first in all reservations
+ * - our window should be last in all reservations
+ * - need to make sure we don't go past end of bitmap
+ */
+ trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv),
+ goal, wanted, RB_EMPTY_ROOT(root));
+
+ assert_spin_locked(&resv_lock);
+
+ if (RB_EMPTY_ROOT(root)) {
+ /*
+ * Easiest case - empty tree. We can just take
+ * whatever window of free bits we want.
+ */
+ clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+ resmap->m_bitmap_len - goal,
+ &cstart, &clen);
+
+ /*
+ * This should never happen - the local alloc window
+ * will always have free bits when we're called.
+ */
+ BUG_ON(goal == 0 && clen == 0);
+
+ if (clen == 0)
+ return;
+
+ resv->r_start = cstart;
+ resv->r_len = clen;
+
+ ocfs2_resv_insert(resmap, resv);
+ return;
+ }
+
+ prev_resv = ocfs2_find_resv_lhs(resmap, goal);
+
+ if (prev_resv == NULL) {
+ /*
+ * A NULL here means that the search code couldn't
+ * find a window that starts before goal.
+ *
+ * However, we can take the first window after goal,
+ * which is also by definition, the leftmost window in
+ * the entire tree. If we can find free bits in the
+ * gap between goal and the LHS window, then the
+ * reservation can safely be placed there.
+ *
+ * Otherwise we fall back to a linear search, checking
+ * the gaps in between windows for a place to
+ * allocate.
+ */
+
+ next = rb_first(root);
+ next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
+ r_node);
+
+ /*
+ * The search should never return such a window. (see
+ * comment above
+ */
+ if (next_resv->r_start <= goal) {
+ mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
+ goal, next_resv->r_start, next_resv->r_len);
+ ocfs2_dump_resv(resmap);
+ BUG();
+ }
+
+ clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
+ next_resv->r_start - goal,
+ &cstart, &clen);
+ if (clen) {
+ best_len = clen;
+ best_start = cstart;
+ if (best_len == wanted)
+ goto out_insert;
+ }
+
+ prev_resv = next_resv;
+ next_resv = NULL;
+ }
+
+ trace_ocfs2_resv_find_window_prev(prev_resv->r_start,
+ ocfs2_resv_end(prev_resv));
+
+ prev = &prev_resv->r_node;
+
+ /* Now we do a linear search for a window, starting at 'prev_rsv' */
+ while (1) {
+ next = rb_next(prev);
+ if (next) {
+ next_resv = rb_entry(next,
+ struct ocfs2_alloc_reservation,
+ r_node);
+
+ gap_start = ocfs2_resv_end(prev_resv) + 1;
+ gap_end = next_resv->r_start - 1;
+ gap_len = gap_end - gap_start + 1;
+ } else {
+ /*
+ * We're at the rightmost edge of the
+ * tree. See if a reservation between this
+ * window and the end of the bitmap will work.
+ */
+ gap_start = ocfs2_resv_end(prev_resv) + 1;
+ gap_len = resmap->m_bitmap_len - gap_start;
+ gap_end = resmap->m_bitmap_len - 1;
+ }
+
+ trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
+ next ? ocfs2_resv_end(next_resv) : -1);
+ /*
+ * No need to check this gap if we have already found
+ * a larger region of free bits.
+ */
+ if (gap_len <= best_len)
+ goto next_resv;
+
+ clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
+ gap_len, &cstart, &clen);
+ if (clen == wanted) {
+ best_len = clen;
+ best_start = cstart;
+ goto out_insert;
+ } else if (clen > best_len) {
+ best_len = clen;
+ best_start = cstart;
+ }
+
+next_resv:
+ if (!next)
+ break;
+
+ prev = next;
+ prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
+ r_node);
+ }
+
+out_insert:
+ if (best_len) {
+ resv->r_start = best_start;
+ resv->r_len = best_len;
+ ocfs2_resv_insert(resmap, resv);
+ }
+}
+
+static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int wanted)
+{
+ struct ocfs2_alloc_reservation *lru_resv;
+ int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
+ unsigned int min_bits;
+
+ if (!tmpwindow)
+ min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
+ else
+ min_bits = wanted; /* We at know the temp window will use all
+ * of these bits */
+
+ /*
+ * Take the first reservation off the LRU as our 'target'. We
+ * don't try to be smart about it. There might be a case for
+ * searching based on size but I don't have enough data to be
+ * sure. --Mark (3/16/2010)
+ */
+ lru_resv = list_first_entry(&resmap->m_lru,
+ struct ocfs2_alloc_reservation, r_lru);
+
+ trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start,
+ lru_resv->r_len,
+ ocfs2_resv_end(lru_resv));
+
+ /*
+ * Cannibalize (some or all) of the target reservation and
+ * feed it to the current window.
+ */
+ if (lru_resv->r_len <= min_bits) {
+ /*
+ * Discard completely if size is less than or equal to a
+ * reasonable threshold - 50% of window bits for non temporary
+ * windows.
+ */
+ resv->r_start = lru_resv->r_start;
+ resv->r_len = lru_resv->r_len;
+
+ __ocfs2_resv_discard(resmap, lru_resv);
+ } else {
+ unsigned int shrink;
+ if (tmpwindow)
+ shrink = min_bits;
+ else
+ shrink = lru_resv->r_len / 2;
+
+ lru_resv->r_len -= shrink;
+
+ resv->r_start = ocfs2_resv_end(lru_resv) + 1;
+ resv->r_len = shrink;
+ }
+
+ trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv),
+ resv->r_len, resv->r_last_start,
+ resv->r_last_len);
+
+ ocfs2_resv_insert(resmap, resv);
+}
+
+static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int wanted)
+{
+ unsigned int goal = 0;
+
+ BUG_ON(!ocfs2_resv_empty(resv));
+
+ /*
+ * Begin by trying to get a window as close to the previous
+ * one as possible. Using the most recent allocation as a
+ * start goal makes sense.
+ */
+ if (resv->r_last_len) {
+ goal = resv->r_last_start + resv->r_last_len;
+ if (goal >= resmap->m_bitmap_len)
+ goal = 0;
+ }
+
+ __ocfs2_resv_find_window(resmap, resv, goal, wanted);
+
+ /* Search from last alloc didn't work, try once more from beginning. */
+ if (ocfs2_resv_empty(resv) && goal != 0)
+ __ocfs2_resv_find_window(resmap, resv, 0, wanted);
+
+ if (ocfs2_resv_empty(resv)) {
+ /*
+ * Still empty? Pull oldest one off the LRU, remove it from
+ * tree, put this one in it's place.
+ */
+ ocfs2_cannibalize_resv(resmap, resv, wanted);
+ }
+
+ BUG_ON(ocfs2_resv_empty(resv));
+}
+
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ int *cstart, int *clen)
+{
+ if (resv == NULL || ocfs2_resmap_disabled(resmap))
+ return -ENOSPC;
+
+ spin_lock(&resv_lock);
+
+ if (ocfs2_resv_empty(resv)) {
+ /*
+ * We don't want to over-allocate for temporary
+ * windows. Otherwise, we run the risk of fragmenting the
+ * allocation space.
+ */
+ unsigned int wanted = ocfs2_resv_window_bits(resmap, resv);
+
+ if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
+ wanted = *clen;
+
+ /*
+ * Try to get a window here. If it works, we must fall
+ * through and test the bitmap . This avoids some
+ * ping-ponging of windows due to non-reserved space
+ * being allocation before we initialize a window for
+ * that inode.
+ */
+ ocfs2_resv_find_window(resmap, resv, wanted);
+ trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len);
+ }
+
+ BUG_ON(ocfs2_resv_empty(resv));
+
+ *cstart = resv->r_start;
+ *clen = resv->r_len;
+
+ spin_unlock(&resv_lock);
+ return 0;
+}
+
+static void
+ ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ unsigned int start, unsigned int end)
+{
+ unsigned int rhs = 0;
+ unsigned int old_end = ocfs2_resv_end(resv);
+
+ BUG_ON(start != resv->r_start || old_end < end);
+
+ /*
+ * Completely used? We can remove it then.
+ */
+ if (old_end == end) {
+ __ocfs2_resv_discard(resmap, resv);
+ return;
+ }
+
+ rhs = old_end - end;
+
+ /*
+ * This should have been trapped above.
+ */
+ BUG_ON(rhs == 0);
+
+ resv->r_start = end + 1;
+ resv->r_len = old_end - resv->r_start + 1;
+}
+
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ u32 cstart, u32 clen)
+{
+ unsigned int cend = cstart + clen - 1;
+
+ if (resmap == NULL || ocfs2_resmap_disabled(resmap))
+ return;
+
+ if (resv == NULL)
+ return;
+
+ BUG_ON(cstart != resv->r_start);
+
+ spin_lock(&resv_lock);
+
+ trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start,
+ ocfs2_resv_end(resv), resv->r_len,
+ resv->r_last_start,
+ resv->r_last_len);
+
+ BUG_ON(cstart < resv->r_start);
+ BUG_ON(cstart > ocfs2_resv_end(resv));
+ BUG_ON(cend > ocfs2_resv_end(resv));
+
+ ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
+ resv->r_last_start = cstart;
+ resv->r_last_len = clen;
+
+ /*
+ * May have been discarded above from
+ * ocfs2_adjust_resv_from_alloc().
+ */
+ if (!ocfs2_resv_empty(resv))
+ ocfs2_resv_mark_lru(resmap, resv);
+
+ trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv),
+ resv->r_len, resv->r_last_start,
+ resv->r_last_len);
+
+ ocfs2_check_resmap(resmap);
+
+ spin_unlock(&resv_lock);
+}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644
index 00000000000..42c2b804f3f
--- /dev/null
+++ b/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * reservations.h
+ *
+ * Allocation reservations function prototypes and structures.
+ *
+ * Copyright (C) 2010 Novell. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_RESERVATIONS_H
+#define OCFS2_RESERVATIONS_H
+
+#include <linux/rbtree.h>
+
+#define OCFS2_DEFAULT_RESV_LEVEL 2
+#define OCFS2_MAX_RESV_LEVEL 9
+#define OCFS2_MIN_RESV_LEVEL 0
+
+struct ocfs2_alloc_reservation {
+ struct rb_node r_node;
+
+ unsigned int r_start; /* Beginning of current window */
+ unsigned int r_len; /* Length of the window */
+
+ unsigned int r_last_len; /* Length of most recent alloc */
+ unsigned int r_last_start; /* Start of most recent alloc */
+ struct list_head r_lru; /* LRU list head */
+
+ unsigned int r_flags;
+};
+
+#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */
+#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
+ * destroyed immedately after use */
+#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
+ * directory btree */
+
+struct ocfs2_reservation_map {
+ struct rb_root m_reservations;
+ char *m_disk_bitmap;
+
+ struct ocfs2_super *m_osb;
+
+ /* The following are not initialized to meaningful values until a disk
+ * bitmap is provided. */
+ u32 m_bitmap_len; /* Number of valid
+ * bits available */
+
+ struct list_head m_lru; /* LRU of reservations
+ * structures. */
+
+};
+
+void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
+
+#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
+void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
+ unsigned int flags);
+
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
+
+/**
+ * ocfs2_resv_discard() - truncate a reservation
+ * @resmap:
+ * @resv: the reservation to truncate.
+ *
+ * After this function is called, the reservation will be empty, and
+ * unlinked from the rbtree.
+ */
+void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv);
+
+
+/**
+ * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
+ * @resmap: struct ocfs2_reservation_map to initialize
+ * @obj: unused for now
+ * @ops: unused for now
+ * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize)
+ *
+ * Only possible return value other than '0' is -ENOMEM for failure to
+ * allocation mirror bitmap.
+ */
+int ocfs2_resmap_init(struct ocfs2_super *osb,
+ struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_restart() - "restart" a reservation bitmap
+ * @resmap: reservations bitmap
+ * @clen: Number of valid bits in the bitmap
+ * @disk_bitmap: the disk bitmap this resmap should refer to.
+ *
+ * Re-initialize the parameters of a reservation bitmap. This is
+ * useful for local alloc window slides.
+ *
+ * This function will call ocfs2_trunc_resv against all existing
+ * reservations. A future version will recalculate existing
+ * reservations based on the new bitmap.
+ */
+void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
+ unsigned int clen, char *disk_bitmap);
+
+/**
+ * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
+ * @resmap: the struct ocfs2_reservation_map to uninitialize
+ */
+void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
+
+/**
+ * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
+ * @resmap: reservations bitmap
+ * @resv: reservation to base search from
+ * @cstart: start of proposed allocation
+ * @clen: length (in clusters) of proposed allocation
+ *
+ * Using the reservation data from resv, this function will compare
+ * resmap and resmap->m_disk_bitmap to determine what part (if any) of
+ * the reservation window is still clear to use. If resv is empty,
+ * this function will try to allocate a window for it.
+ *
+ * On success, zero is returned and the valid allocation area is set in cstart
+ * and clen.
+ *
+ * Returns -ENOSPC if reservations are disabled.
+ */
+int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ int *cstart, int *clen);
+
+/**
+ * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
+ * @resmap: reservations bitmap
+ * @resv: optional reservation to recalulate based on new bitmap
+ * @cstart: start of allocation in clusters
+ * @clen: end of allocation in clusters.
+ *
+ * Tell the reservation code that bits were used to fulfill allocation in
+ * resmap. The bits don't have to have been part of any existing
+ * reservation. But we must always call this function when bits are claimed.
+ * Internally, the reservations code will use this information to mark the
+ * reservations bitmap. If resv is passed, it's next allocation window will be
+ * calculated. It also expects that 'cstart' is the same as we passed back
+ * from ocfs2_resmap_resv_bits().
+ */
+void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
+ struct ocfs2_alloc_reservation *resv,
+ u32 cstart, u32 clen);
+
+#endif /* OCFS2_RESERVATIONS_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 3c3d673a4d2..d5da6f62414 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -27,7 +27,6 @@
#include <linux/fs.h>
#include <linux/types.h>
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -39,6 +38,7 @@
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
#include "suballoc.h"
@@ -53,8 +53,6 @@
*/
static u16 ocfs2_calc_new_backup_super(struct inode *inode,
struct ocfs2_group_desc *gd,
- int new_clusters,
- u32 first_new_cluster,
u16 cl_cpg,
int set)
{
@@ -82,7 +80,6 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode,
backups++;
}
- mlog_exit_void();
return backups;
}
@@ -103,8 +100,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
- mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
- new_clusters, first_new_cluster);
+ trace_ocfs2_update_last_group_and_inode(new_clusters,
+ first_new_cluster);
ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -128,17 +125,11 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
backups = ocfs2_calc_new_backup_super(bm_inode,
group,
- new_clusters,
- first_new_cluster,
cl_cpg, 1);
le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
}
- ret = ocfs2_journal_dirty(handle, group_bh);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_rollback;
- }
+ ocfs2_journal_dirty(handle, group_bh);
/* update the inode accordingly. */
ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
@@ -162,7 +153,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
spin_lock(&OCFS2_I(bm_inode)->ip_lock);
OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits);
+ le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits);
spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
i_size_write(bm_inode, le64_to_cpu(fe->i_size));
@@ -172,15 +163,14 @@ out_rollback:
if (ret < 0) {
ocfs2_calc_new_backup_super(bm_inode,
group,
- new_clusters,
- first_new_cluster,
cl_cpg, 0);
le16_add_cpu(&group->bg_free_bits_count, backups);
le16_add_cpu(&group->bg_bits, -1 * num_bits);
le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
}
out:
- mlog_exit(ret);
+ if (ret)
+ mlog_errno(ret);
return ret;
}
@@ -285,8 +275,6 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
u32 first_new_cluster;
u64 lgd_blkno;
- mlog_entry_void();
-
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
@@ -319,7 +307,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
- ocfs2_group_bitmap_size(osb->sb) * 8) {
+ ocfs2_group_bitmap_size(osb->sb, 0,
+ osb->s_feature_incompat) * 8) {
mlog(ML_ERROR, "The disk is too old and small. "
"Force to do offline resize.");
ret = -EINVAL;
@@ -345,7 +334,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
goto out_unlock;
}
- mlog(0, "extend the last group at %llu, new clusters = %d\n",
+
+ trace_ocfs2_group_extend(
(unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
@@ -380,7 +370,6 @@ out_mutex:
iput(main_bm_inode);
out:
- mlog_exit_void();
return ret;
}
@@ -474,8 +463,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
struct ocfs2_chain_list *cl;
struct ocfs2_chain_rec *cr;
u16 cl_bpc;
-
- mlog_entry_void();
+ u64 bg_ptr;
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
@@ -500,7 +488,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
- ocfs2_group_bitmap_size(osb->sb) * 8) {
+ ocfs2_group_bitmap_size(osb->sb, 0,
+ osb->s_feature_incompat) * 8) {
mlog(ML_ERROR, "The disk is too old and small."
" Force to do offline resize.");
ret = -EINVAL;
@@ -519,17 +508,17 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
if (ret) {
mlog_errno(ret);
- goto out_unlock;
+ goto out_free_group_bh;
}
- mlog(0, "Add a new group %llu in chain = %u, length = %u\n",
- (unsigned long long)input->group, input->chain, input->clusters);
+ trace_ocfs2_group_add((unsigned long long)input->group,
+ input->chain, input->clusters, input->frees);
handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
if (IS_ERR(handle)) {
mlog_errno(PTR_ERR(handle));
ret = -EINVAL;
- goto out_unlock;
+ goto out_free_group_bh;
}
cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
@@ -544,17 +533,14 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
}
group = (struct ocfs2_group_desc *)group_bh->b_data;
+ bg_ptr = le64_to_cpu(group->bg_next_group);
group->bg_next_group = cr->c_blkno;
-
- ret = ocfs2_journal_dirty(handle, group_bh);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, group_bh);
ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
+ group->bg_next_group = cpu_to_le64(bg_ptr);
mlog_errno(ret);
goto out_commit;
}
@@ -577,7 +563,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits);
+ le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits);
spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
@@ -585,8 +571,11 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
out_commit:
ocfs2_commit_trans(osb, handle);
-out_unlock:
+
+out_free_group_bh:
brelse(group_bh);
+
+out_unlock:
brelse(main_bm_bh);
ocfs2_inode_unlock(main_bm_inode, 1);
@@ -596,6 +585,5 @@ out_mutex:
iput(main_bm_inode);
out:
- mlog_exit_void();
return ret;
}
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bfbd7e9e949..1424c151ccc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -27,7 +27,6 @@
#include <linux/slab.h>
#include <linux/highmem.h>
-#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -39,6 +38,7 @@
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -142,8 +142,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
BUG_ON(si->si_blocks == 0);
BUG_ON(si->si_bh == NULL);
- mlog(0, "Refreshing slot map, reading %u block(s)\n",
- si->si_blocks);
+ trace_ocfs2_refresh_slot_info(si->si_blocks);
/*
* We pass -1 as blocknr because we expect all of si->si_bh to
@@ -357,7 +356,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
{
int status = 0;
u64 blkno;
- unsigned long long blocks, bytes;
+ unsigned long long blocks, bytes = 0;
unsigned int i;
struct buffer_head *bh;
@@ -381,8 +380,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
/* The size checks above should ensure this */
BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
- mlog(0, "Slot map needs %u buffers for %llu bytes\n",
- si->si_blocks, bytes);
+ trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
GFP_KERNEL);
@@ -400,8 +398,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
goto bail;
}
- mlog(0, "Reading slot map block %u at %llu\n", i,
- (unsigned long long)blkno);
+ trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i);
bh = NULL; /* Acquire a fresh bh */
status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
@@ -475,8 +472,6 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
int slot;
struct ocfs2_slot_info *si;
- mlog_entry_void();
-
si = osb->slot_info;
spin_lock(&osb->osb_lock);
@@ -498,21 +493,20 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
goto bail;
}
} else
- mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
- slot);
+ printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
+ "allocated to this node!\n", slot, osb->dev_str);
ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
spin_unlock(&osb->osb_lock);
- mlog(0, "taking node slot %d\n", osb->slot_num);
+ trace_ocfs2_find_slot(osb->slot_num);
status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
if (status < 0)
mlog_errno(status);
bail:
- mlog_exit(status);
return status;
}
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 7020e1253ff..1724d43d3da 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -19,6 +19,7 @@
#include <linux/kernel.h>
#include <linux/crc32.h>
+#include <linux/slab.h>
#include <linux/module.h>
/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
@@ -27,6 +28,7 @@
#include "cluster/masklog.h"
#include "cluster/nodemanager.h"
#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
#include "stackglue.h"
@@ -255,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
}
/*
+ * Check if this node is heartbeating and is connected to all other
+ * heartbeating nodes.
+ */
+static int o2cb_cluster_check(void)
+{
+ u8 node_num;
+ int i;
+ unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+ node_num = o2nm_this_node();
+ if (node_num == O2NM_MAX_NODES) {
+ printk(KERN_ERR "o2cb: This node has not been configured.\n");
+ return -EINVAL;
+ }
+
+ /*
+ * o2dlm expects o2net sockets to be created. If not, then
+ * dlm_join_domain() fails with a stack of errors which are both cryptic
+ * and incomplete. The idea here is to detect upfront whether we have
+ * managed to connect to all nodes or not. If not, then list the nodes
+ * to allow the user to check the configuration (incorrect IP, firewall,
+ * etc.) Yes, this is racy. But its not the end of the world.
+ */
+#define O2CB_MAP_STABILIZE_COUNT 60
+ for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
+ o2hb_fill_node_map(hbmap, sizeof(hbmap));
+ if (!test_bit(node_num, hbmap)) {
+ printk(KERN_ERR "o2cb: %s heartbeat has not been "
+ "started.\n", (o2hb_global_heartbeat_active() ?
+ "Global" : "Local"));
+ return -EINVAL;
+ }
+ o2net_fill_node_map(netmap, sizeof(netmap));
+ /* Force set the current node to allow easy compare */
+ set_bit(node_num, netmap);
+ if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+ return 0;
+ if (i < O2CB_MAP_STABILIZE_COUNT)
+ msleep(1000);
+ }
+
+ printk(KERN_ERR "o2cb: This node could not connect to nodes:");
+ i = -1;
+ while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
+ i + 1)) < O2NM_MAX_NODES) {
+ if (!test_bit(i, netmap))
+ printk(" %u", i);
+ }
+ printk(".\n");
+
+ return -ENOTCONN;
+}
+
+/*
* Called from the dlm when it's about to evict a node. This is how the
* classic stack signals node death.
*/
@@ -262,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data)
{
struct ocfs2_cluster_connection *conn = data;
- mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
- node_num, conn->cc_namelen, conn->cc_name);
+ printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n",
+ node_num, conn->cc_namelen, conn->cc_name);
conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
}
@@ -279,10 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
BUG_ON(conn == NULL);
BUG_ON(conn->cc_proto == NULL);
- /* for now we only have one cluster/node, make sure we see it
- * in the heartbeat universe */
- if (!o2hb_check_local_node_heartbeating()) {
- rc = -EINVAL;
+ /* Ensure cluster stack is up and all nodes are connected */
+ rc = o2cb_cluster_check();
+ if (rc) {
+ printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
+ "before retrying.\n");
goto out;
}
@@ -318,7 +376,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
out_free:
- if (rc && conn->cc_private)
+ if (rc)
kfree(conn->cc_private);
out:
@@ -340,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
return 0;
}
-static int o2cb_cluster_this_node(unsigned int *node)
+static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node)
{
int node_num;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 5ae8812b286..13a8537d8e8 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,8 +21,9 @@
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
-#include <linux/smp_lock.h>
+#include <linux/slab.h>
#include <linux/reboot.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include "stackglue.h"
@@ -102,6 +103,12 @@
#define OCFS2_TEXT_UUID_LEN 32
#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2
#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8
+#define VERSION_LOCK "version_lock"
+
+enum ocfs2_connection_type {
+ WITH_CONTROLD,
+ NO_CONTROLD
+};
/*
* ocfs2_live_connection is refcounted because the filesystem and
@@ -110,6 +117,13 @@
struct ocfs2_live_connection {
struct list_head oc_list;
struct ocfs2_cluster_connection *oc_conn;
+ enum ocfs2_connection_type oc_type;
+ atomic_t oc_this_node;
+ int oc_our_slot;
+ struct dlm_lksb oc_version_lksb;
+ char oc_lvb[DLM_LVB_LEN];
+ struct completion oc_sync_wait;
+ wait_queue_head_t oc_wait;
};
struct ocfs2_control_private {
@@ -190,7 +204,7 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
return c;
}
- return c;
+ return NULL;
}
/*
@@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
* mount path. Since the VFS prevents multiple calls to
* fill_super(), we can't get dupes here.
*/
-static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
- struct ocfs2_live_connection **c_ret)
+static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
+ struct ocfs2_live_connection *c)
{
int rc = 0;
- struct ocfs2_live_connection *c;
-
- c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
- if (!c)
- return -ENOMEM;
mutex_lock(&ocfs2_control_lock);
c->oc_conn = conn;
- if (atomic_read(&ocfs2_control_opened))
+ if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))
list_add(&c->oc_list, &ocfs2_live_connection_list);
else {
printk(KERN_ERR
@@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
}
mutex_unlock(&ocfs2_control_lock);
-
- if (!rc)
- *c_ret = c;
- else
- kfree(c);
-
return rc;
}
@@ -611,12 +614,10 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
return -ENOMEM;
p->op_this_node = -1;
- lock_kernel();
mutex_lock(&ocfs2_control_lock);
file->private_data = p;
list_add(&p->op_list, &ocfs2_control_private_list);
mutex_unlock(&ocfs2_control_lock);
- unlock_kernel();
return 0;
}
@@ -627,6 +628,7 @@ static const struct file_operations ocfs2_control_fops = {
.read = ocfs2_control_read,
.write = ocfs2_control_write,
.owner = THIS_MODULE,
+ .llseek = default_llseek,
};
static struct miscdevice ocfs2_control_device = {
@@ -800,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
return 0;
}
+static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ ver->pv_major = pv->pv_major;
+ ver->pv_minor = pv->pv_minor;
+}
+
+static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ pv->pv_major = ver->pv_major;
+ pv->pv_minor = ver->pv_minor;
+}
+
+static void sync_wait_cb(void *arg)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ complete(&lc->oc_sync_wait);
+}
+
+static int sync_unlock(struct ocfs2_cluster_connection *conn,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x error %d\n",
+ name, lksb->sb_lkid, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ if (lksb->sb_status != -DLM_EUNLOCK) {
+ printk(KERN_ERR "%s lkid %x status %d\n",
+ name, lksb->sb_lkid, lksb->sb_status);
+ return -1;
+ }
+ return 0;
+}
+
+static int sync_lock(struct ocfs2_cluster_connection *conn,
+ int mode, uint32_t flags,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error, status;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_lock(conn->cc_lockspace, mode, lksb, flags,
+ name, strlen(name),
+ 0, sync_wait_cb, conn, NULL);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n",
+ name, lksb->sb_lkid, flags, mode, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ status = lksb->sb_status;
+
+ if (status && status != -EAGAIN) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n",
+ name, lksb->sb_lkid, flags, mode, status);
+ }
+
+ return status;
+}
+
+
+static int version_lock(struct ocfs2_cluster_connection *conn, int mode,
+ int flags)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_lock(conn, mode, flags,
+ &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+static int version_unlock(struct ocfs2_cluster_connection *conn)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+/* get_protocol_version()
+ *
+ * To exchange ocfs2 versioning, we use the LVB of the version dlm lock.
+ * The algorithm is:
+ * 1. Attempt to take the lock in EX mode (non-blocking).
+ * 2. If successful (which means it is the first mount), write the
+ * version number and downconvert to PR lock.
+ * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after
+ * taking the PR lock.
+ */
+
+static int get_protocol_version(struct ocfs2_cluster_connection *conn)
+{
+ int ret;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ struct ocfs2_protocol_version pv;
+
+ running_proto.pv_major =
+ ocfs2_user_plugin.sp_max_proto.pv_major;
+ running_proto.pv_minor =
+ ocfs2_user_plugin.sp_max_proto.pv_minor;
+
+ lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb;
+ ret = version_lock(conn, DLM_LOCK_EX,
+ DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
+ if (!ret) {
+ conn->cc_version.pv_major = running_proto.pv_major;
+ conn->cc_version.pv_minor = running_proto.pv_minor;
+ version_to_lvb(&running_proto, lc->oc_lvb);
+ version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+ } else if (ret == -EAGAIN) {
+ ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
+ if (ret)
+ goto out;
+ lvb_to_version(lc->oc_lvb, &pv);
+
+ if ((pv.pv_major != running_proto.pv_major) ||
+ (pv.pv_minor > running_proto.pv_minor)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ conn->cc_version.pv_major = pv.pv_major;
+ conn->cc_version.pv_minor = pv.pv_minor;
+ }
+out:
+ return ret;
+}
+
+static void user_recover_prep(void *arg)
+{
+}
+
+static void user_recover_slot(void *arg, struct dlm_slot *slot)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
+ slot->nodeid, slot->slot);
+ conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
+
+}
+
+static void user_recover_done(void *arg, struct dlm_slot *slots,
+ int num_slots, int our_slot,
+ uint32_t generation)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ int i;
+
+ for (i = 0; i < num_slots; i++)
+ if (slots[i].slot == our_slot) {
+ atomic_set(&lc->oc_this_node, slots[i].nodeid);
+ break;
+ }
+
+ lc->oc_our_slot = our_slot;
+ wake_up(&lc->oc_wait);
+}
+
+static const struct dlm_lockspace_ops ocfs2_ls_ops = {
+ .recover_prep = user_recover_prep,
+ .recover_slot = user_recover_slot,
+ .recover_done = user_recover_done,
+};
+
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+ version_unlock(conn);
+ dlm_release_lockspace(conn->cc_lockspace, 2);
+ conn->cc_lockspace = NULL;
+ ocfs2_live_connection_drop(conn->cc_private);
+ conn->cc_private = NULL;
+ return 0;
+}
+
static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
{
dlm_lockspace_t *fsdlm;
- struct ocfs2_live_connection *uninitialized_var(control);
- int rc = 0;
+ struct ocfs2_live_connection *lc;
+ int rc, ops_rv;
BUG_ON(conn == NULL);
- rc = ocfs2_live_connection_new(conn, &control);
+ lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+ if (!lc) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ init_waitqueue_head(&lc->oc_wait);
+ init_completion(&lc->oc_sync_wait);
+ atomic_set(&lc->oc_this_node, 0);
+ conn->cc_private = lc;
+ lc->oc_type = NO_CONTROLD;
+
+ rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
+ DLM_LSFL_FS, DLM_LVB_LEN,
+ &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
+ if (rc)
+ goto out;
+
+ if (ops_rv == -EOPNOTSUPP) {
+ lc->oc_type = WITH_CONTROLD;
+ printk(KERN_NOTICE "ocfs2: You seem to be using an older "
+ "version of dlm_controld and/or ocfs2-tools."
+ " Please consider upgrading.\n");
+ } else if (ops_rv) {
+ rc = ops_rv;
+ goto out;
+ }
+ conn->cc_lockspace = fsdlm;
+
+ rc = ocfs2_live_connection_attach(conn, lc);
if (rc)
goto out;
+ if (lc->oc_type == NO_CONTROLD) {
+ rc = get_protocol_version(conn);
+ if (rc) {
+ printk(KERN_ERR "ocfs2: Could not determine"
+ " locking version\n");
+ user_cluster_disconnect(conn);
+ goto out;
+ }
+ wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
+ }
+
/*
* running_proto must have been set before we allowed any mounts
* to proceed.
@@ -819,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
printk(KERN_ERR
"Unable to mount with fs locking protocol version "
- "%u.%u because the userspace control daemon has "
- "negotiated %u.%u\n",
+ "%u.%u because negotiated protocol is %u.%u\n",
conn->cc_version.pv_major, conn->cc_version.pv_minor,
running_proto.pv_major, running_proto.pv_minor);
rc = -EPROTO;
- ocfs2_live_connection_drop(control);
- goto out;
- }
-
- rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
- &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN);
- if (rc) {
- ocfs2_live_connection_drop(control);
- goto out;
+ ocfs2_live_connection_drop(lc);
+ lc = NULL;
}
- conn->cc_private = control;
- conn->cc_lockspace = fsdlm;
out:
+ if (rc && lc)
+ kfree(lc);
return rc;
}
-static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
-{
- dlm_release_lockspace(conn->cc_lockspace, 2);
- conn->cc_lockspace = NULL;
- ocfs2_live_connection_drop(conn->cc_private);
- conn->cc_private = NULL;
- return 0;
-}
-static int user_cluster_this_node(unsigned int *this_node)
+static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *this_node)
{
int rc;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ if (lc->oc_type == WITH_CONTROLD)
+ rc = ocfs2_control_get_this_node();
+ else if (lc->oc_type == NO_CONTROLD)
+ rc = atomic_read(&lc->oc_this_node);
+ else
+ rc = -EINVAL;
- rc = ocfs2_control_get_this_node();
if (rc < 0)
return rc;
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 39abf89697e..5d965e83bd4 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
EXPORT_SYMBOL_GPL(ocfs2_plock);
int ocfs2_cluster_connect(const char *stack_name,
+ const char *cluster_name,
+ int cluster_name_len,
const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
@@ -342,8 +344,12 @@ int ocfs2_cluster_connect(const char *stack_name,
goto out;
}
- memcpy(new_conn->cc_name, group, grouplen);
+ strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
new_conn->cc_namelen = grouplen;
+ if (cluster_name_len)
+ strlcpy(new_conn->cc_cluster_name, cluster_name,
+ CLUSTER_NAME_MAX + 1);
+ new_conn->cc_cluster_name_len = cluster_name_len;
new_conn->cc_recovery_handler = recovery_handler;
new_conn->cc_recovery_data = recovery_data;
@@ -386,8 +392,9 @@ int ocfs2_cluster_connect_agnostic(const char *group,
if (cluster_stack_name[0])
stack_name = cluster_stack_name;
- return ocfs2_cluster_connect(stack_name, group, grouplen, lproto,
- recovery_handler, recovery_data, conn);
+ return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
+ lproto, recovery_handler, recovery_data,
+ conn);
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
@@ -460,9 +467,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen)
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
-int ocfs2_cluster_this_node(unsigned int *node)
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node)
{
- return active_stack->sp_ops->this_node(node);
+ return active_stack->sp_ops->this_node(conn, node);
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
@@ -488,7 +496,7 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_max_locking_protocol =
- __ATTR(max_locking_protocol, S_IFREG | S_IRUGO,
+ __ATTR(max_locking_protocol, S_IRUGO,
ocfs2_max_locking_protocol_show, NULL);
static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
@@ -520,7 +528,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
- __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO,
+ __ATTR(loaded_cluster_plugins, S_IRUGO,
ocfs2_loaded_cluster_plugins_show, NULL);
static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
@@ -542,7 +550,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
- __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO,
+ __ATTR(active_cluster_plugin, S_IRUGO,
ocfs2_active_cluster_plugin_show, NULL);
static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
@@ -591,15 +599,29 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
static struct kobj_attribute ocfs2_attr_cluster_stack =
- __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR,
+ __ATTR(cluster_stack, S_IRUGO | S_IWUSR,
ocfs2_cluster_stack_show,
ocfs2_cluster_stack_store);
+
+
+static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "1\n");
+}
+
+static struct kobj_attribute ocfs2_attr_dlm_recover_support =
+ __ATTR(dlm_recover_callback_support, S_IRUGO,
+ ocfs2_dlm_recover_show, NULL);
+
static struct attribute *ocfs2_attrs[] = {
&ocfs2_attr_max_locking_protocol.attr,
&ocfs2_attr_loaded_cluster_plugins.attr,
&ocfs2_attr_active_cluster_plugin.attr,
&ocfs2_attr_cluster_stack.attr,
+ &ocfs2_attr_dlm_recover_support.attr,
NULL,
};
@@ -643,7 +665,7 @@ error:
#define FS_OCFS2_NM 1
-static ctl_table ocfs2_nm_table[] = {
+static struct ctl_table ocfs2_nm_table[] = {
{
.procname = "hb_ctl_path",
.data = ocfs2_hb_ctl_path,
@@ -654,7 +676,7 @@ static ctl_table ocfs2_nm_table[] = {
{ }
};
-static ctl_table ocfs2_mod_table[] = {
+static struct ctl_table ocfs2_mod_table[] = {
{
.procname = "nm",
.data = NULL,
@@ -665,7 +687,7 @@ static ctl_table ocfs2_mod_table[] = {
{ }
};
-static ctl_table ocfs2_kern_table[] = {
+static struct ctl_table ocfs2_kern_table[] = {
{
.procname = "ocfs2",
.data = NULL,
@@ -676,7 +698,7 @@ static ctl_table ocfs2_kern_table[] = {
{ }
};
-static ctl_table ocfs2_root_table[] = {
+static struct ctl_table ocfs2_root_table[] = {
{
.procname = "fs",
.data = NULL,
@@ -687,7 +709,7 @@ static ctl_table ocfs2_root_table[] = {
{ }
};
-static struct ctl_table_header *ocfs2_table_header = NULL;
+static struct ctl_table_header *ocfs2_table_header;
/*
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 8ce7398ae1d..66334a30cea 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -45,6 +45,9 @@ struct file_lock;
*/
#define GROUP_NAME_MAX 64
+/* This shadows OCFS2_CLUSTER_NAME_LEN */
+#define CLUSTER_NAME_MAX 16
+
/*
* ocfs2_protocol_version changes when ocfs2 does something different in
@@ -97,8 +100,10 @@ struct ocfs2_locking_protocol {
* locking compatibility.
*/
struct ocfs2_cluster_connection {
- char cc_name[GROUP_NAME_MAX];
+ char cc_name[GROUP_NAME_MAX + 1];
int cc_namelen;
+ char cc_cluster_name[CLUSTER_NAME_MAX + 1];
+ int cc_cluster_name_len;
struct ocfs2_protocol_version cc_version;
struct ocfs2_locking_protocol *cc_proto;
void (*cc_recovery_handler)(int node_num, void *recovery_data);
@@ -126,7 +131,7 @@ struct ocfs2_stack_operations {
*
* ->connect() must not return until it is guaranteed that
*
- * - Node down notifications for the filesystem will be recieved
+ * - Node down notifications for the filesystem will be received
* and passed to conn->cc_recovery_handler().
* - Locking requests for the filesystem will be processed.
*/
@@ -152,7 +157,8 @@ struct ocfs2_stack_operations {
* ->this_node() returns the cluster's unique identifier for the
* local node.
*/
- int (*this_node)(unsigned int *node);
+ int (*this_node)(struct ocfs2_cluster_connection *conn,
+ unsigned int *node);
/*
* Call the underlying dlm lock function. The ->dlm_lock()
@@ -239,6 +245,8 @@ struct ocfs2_stack_plugin {
/* Used by the filesystem */
int ocfs2_cluster_connect(const char *stack_name,
+ const char *cluster_name,
+ int cluster_name_len,
const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
@@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group,
int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
int hangup_pending);
void ocfs2_cluster_hangup(const char *group, int grouplen);
-int ocfs2_cluster_this_node(unsigned int *node);
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node);
struct ocfs2_lock_res;
int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c3c60bc3e07..0cb889a17ae 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -29,7 +29,6 @@
#include <linux/slab.h>
#include <linux/highmem.h>
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -44,6 +43,7 @@
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -53,6 +53,32 @@
#define OCFS2_MAX_TO_STEAL 1024
+struct ocfs2_suballoc_result {
+ u64 sr_bg_blkno; /* The bg we allocated from. Set
+ to 0 when a block group is
+ contiguous. */
+ u64 sr_bg_stable_blkno; /*
+ * Doesn't change, always
+ * set to target block
+ * group descriptor
+ * block.
+ */
+ u64 sr_blkno; /* The first allocated block */
+ unsigned int sr_bit_offset; /* The bit in the bg */
+ unsigned int sr_bits; /* How many bits we claimed */
+};
+
+static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
+{
+ if (res->sr_blkno == 0)
+ return 0;
+
+ if (res->sr_bg_blkno)
+ return res->sr_bg_blkno;
+
+ return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
+}
+
static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -60,6 +86,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *bg_bh,
u64 group_blkno,
+ unsigned int group_clusters,
u16 my_chain,
struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
@@ -73,35 +100,19 @@ static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
- u16 *bit_off, u16 *bits_found);
+ struct ocfs2_suballoc_result *res);
static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
- u16 *bit_off, u16 *bits_found);
-static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
- struct ocfs2_alloc_context *ac,
+ struct ocfs2_suballoc_result *res);
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 *bg_blkno);
+ struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
int nr);
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits);
-static inline int ocfs2_block_group_clear_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits);
-
static int ocfs2_relink_block_group(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *fe_bh,
@@ -137,6 +148,11 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
}
brelse(ac->ac_bh);
ac->ac_bh = NULL;
+ ac->ac_resv = NULL;
+ if (ac->ac_find_loc_priv) {
+ kfree(ac->ac_find_loc_priv);
+ ac->ac_find_loc_priv = NULL;
+ }
}
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -152,7 +168,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
#define do_error(fmt, ...) \
do{ \
- if (clean_error) \
+ if (resize) \
mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
else \
ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +176,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
static int ocfs2_validate_gd_self(struct super_block *sb,
struct buffer_head *bh,
- int clean_error)
+ int resize)
{
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -211,7 +227,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
static int ocfs2_validate_gd_parent(struct super_block *sb,
struct ocfs2_dinode *di,
struct buffer_head *bh,
- int clean_error)
+ int resize)
{
unsigned int max_bits;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +249,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
return -EINVAL;
}
- if (le16_to_cpu(gd->bg_chain) >=
- le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
+ /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
+ if ((le16_to_cpu(gd->bg_chain) >
+ le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
+ ((le16_to_cpu(gd->bg_chain) ==
+ le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
do_error("Group descriptor #%llu has bad chain %u",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_chain));
@@ -283,8 +302,8 @@ static int ocfs2_validate_group_descriptor(struct super_block *sb,
int rc;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
- mlog(0, "Validating group descriptor %llu\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_validate_group_descriptor(
+ (unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
@@ -329,19 +348,41 @@ out:
return rc;
}
+static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
+ struct ocfs2_group_desc *bg,
+ struct ocfs2_chain_list *cl,
+ u64 p_blkno, unsigned int clusters)
+{
+ struct ocfs2_extent_list *el = &bg->bg_list;
+ struct ocfs2_extent_rec *rec;
+
+ BUG_ON(!ocfs2_supports_discontig_bg(osb));
+ if (!el->l_next_free_rec)
+ el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
+ rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
+ rec->e_blkno = cpu_to_le64(p_blkno);
+ rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
+ le16_to_cpu(cl->cl_bpc));
+ rec->e_leaf_clusters = cpu_to_le16(clusters);
+ le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
+ le16_add_cpu(&bg->bg_free_bits_count,
+ clusters * le16_to_cpu(cl->cl_bpc));
+ le16_add_cpu(&el->l_next_free_rec, 1);
+}
+
static int ocfs2_block_group_fill(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *bg_bh,
u64 group_blkno,
+ unsigned int group_clusters,
u16 my_chain,
struct ocfs2_chain_list *cl)
{
int status = 0;
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct super_block * sb = alloc_inode->i_sb;
- mlog_entry_void();
-
if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
"b_blocknr (%llu)",
@@ -363,19 +404,23 @@ static int ocfs2_block_group_fill(handle_t *handle,
memset(bg, 0, sb->s_blocksize);
strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
- bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
- bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+ bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
+ osb->s_feature_incompat));
bg->bg_chain = cpu_to_le16(my_chain);
bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
bg->bg_blkno = cpu_to_le64(group_blkno);
+ if (group_clusters == le16_to_cpu(cl->cl_cpg))
+ bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
+ else
+ ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
+ group_clusters);
+
/* set the 1st bit in the bitmap to account for the descriptor block */
ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
- status = ocfs2_journal_dirty(handle, bg_bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, bg_bh);
/* There is no need to zero out or otherwise initialize the
* other blocks in a group - All valid FS metadata in a block
@@ -383,7 +428,8 @@ static int ocfs2_block_group_fill(handle_t *handle,
* allocation time. */
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -401,6 +447,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
return best;
}
+static struct buffer_head *
+ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_alloc_context *ac,
+ struct ocfs2_chain_list *cl)
+{
+ int status;
+ u32 bit_off, num_bits;
+ u64 bg_blkno;
+ struct buffer_head *bg_bh;
+ unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
+
+ status = ocfs2_claim_clusters(handle, ac,
+ le16_to_cpu(cl->cl_cpg), &bit_off,
+ &num_bits);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* setup the group */
+ bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ trace_ocfs2_block_group_alloc_contig(
+ (unsigned long long)bg_blkno, alloc_rec);
+
+ bg_bh = sb_getblk(osb->sb, bg_blkno);
+ if (!bg_bh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
+
+ status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+ bg_blkno, num_bits, alloc_rec, cl);
+ if (status < 0) {
+ brelse(bg_bh);
+ mlog_errno(status);
+ }
+
+bail:
+ return status ? ERR_PTR(status) : bg_bh;
+}
+
+static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ unsigned int min_bits,
+ u32 *bit_off, u32 *num_bits)
+{
+ int status = 0;
+
+ while (min_bits) {
+ status = ocfs2_claim_clusters(handle, ac, min_bits,
+ bit_off, num_bits);
+ if (status != -ENOSPC)
+ break;
+
+ min_bits >>= 1;
+ }
+
+ return status;
+}
+
+static int ocfs2_block_group_grow_discontig(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *bg_bh,
+ struct ocfs2_alloc_context *ac,
+ struct ocfs2_chain_list *cl,
+ unsigned int min_bits)
+{
+ int status;
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+ struct ocfs2_group_desc *bg =
+ (struct ocfs2_group_desc *)bg_bh->b_data;
+ unsigned int needed = le16_to_cpu(cl->cl_cpg) -
+ le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+ u32 p_cpos, clusters;
+ u64 p_blkno;
+ struct ocfs2_extent_list *el = &bg->bg_list;
+
+ status = ocfs2_journal_access_gd(handle,
+ INODE_CACHE(alloc_inode),
+ bg_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
+ le16_to_cpu(el->l_count))) {
+ if (min_bits > needed)
+ min_bits = needed;
+ status = ocfs2_block_group_claim_bits(osb, handle, ac,
+ min_bits, &p_cpos,
+ &clusters);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+ p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
+ ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
+ clusters);
+
+ min_bits = clusters;
+ needed = le16_to_cpu(cl->cl_cpg) -
+ le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
+ }
+
+ if (needed > 0) {
+ /*
+ * We have used up all the extent rec but can't fill up
+ * the cpg. So bail out.
+ */
+ status = -ENOSPC;
+ goto bail;
+ }
+
+ ocfs2_journal_dirty(handle, bg_bh);
+
+bail:
+ return status;
+}
+
+static void ocfs2_bg_alloc_cleanup(handle_t *handle,
+ struct ocfs2_alloc_context *cluster_ac,
+ struct inode *alloc_inode,
+ struct buffer_head *bg_bh)
+{
+ int i, ret;
+ struct ocfs2_group_desc *bg;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+
+ if (!bg_bh)
+ return;
+
+ bg = (struct ocfs2_group_desc *)bg_bh->b_data;
+ el = &bg->bg_list;
+ for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
+ ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
+ cluster_ac->ac_bh,
+ le64_to_cpu(rec->e_blkno),
+ le16_to_cpu(rec->e_leaf_clusters));
+ if (ret)
+ mlog_errno(ret);
+ /* Try all the clusters to free */
+ }
+
+ ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
+ brelse(bg_bh);
+}
+
+static struct buffer_head *
+ocfs2_block_group_alloc_discontig(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_alloc_context *ac,
+ struct ocfs2_chain_list *cl)
+{
+ int status;
+ u32 bit_off, num_bits;
+ u64 bg_blkno;
+ unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
+ struct buffer_head *bg_bh = NULL;
+ unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
+ struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
+
+ if (!ocfs2_supports_discontig_bg(osb)) {
+ status = -ENOSPC;
+ goto bail;
+ }
+
+ status = ocfs2_extend_trans(handle,
+ ocfs2_calc_bg_discontig_credits(osb->sb));
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /*
+ * We're going to be grabbing from multiple cluster groups.
+ * We don't have enough credits to relink them all, and the
+ * cluster groups will be staying in cache for the duration of
+ * this operation.
+ */
+ ac->ac_disable_chain_relink = 1;
+
+ /* Claim the first region */
+ status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
+ &bit_off, &num_bits);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+ min_bits = num_bits;
+
+ /* setup the group */
+ bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ trace_ocfs2_block_group_alloc_discontig(
+ (unsigned long long)bg_blkno, alloc_rec);
+
+ bg_bh = sb_getblk(osb->sb, bg_blkno);
+ if (!bg_bh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
+
+ status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
+ bg_blkno, num_bits, alloc_rec, cl);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
+ bg_bh, ac, cl, min_bits);
+ if (status)
+ mlog_errno(status);
+
+bail:
+ if (status)
+ ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
+ return status ? ERR_PTR(status) : bg_bh;
+}
+
/*
* We expect the block group allocator to already be locked.
*/
@@ -416,16 +694,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
struct ocfs2_chain_list *cl;
struct ocfs2_alloc_context *ac = NULL;
handle_t *handle = NULL;
- u32 bit_off, num_bits;
u16 alloc_rec;
- u64 bg_blkno;
struct buffer_head *bg_bh = NULL;
struct ocfs2_group_desc *bg;
BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
- mlog_entry_void();
-
cl = &fe->id2.i_chain;
status = ocfs2_reserve_clusters_with_limit(osb,
le16_to_cpu(cl->cl_cpg),
@@ -447,48 +721,24 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
}
if (last_alloc_group && *last_alloc_group != 0) {
- mlog(0, "use old allocation group %llu for block group alloc\n",
- (unsigned long long)*last_alloc_group);
+ trace_ocfs2_block_group_alloc(
+ (unsigned long long)*last_alloc_group);
ac->ac_last_group = *last_alloc_group;
}
- status = ocfs2_claim_clusters(osb,
- handle,
- ac,
- le16_to_cpu(cl->cl_cpg),
- &bit_off,
- &num_bits);
- if (status < 0) {
+
+ bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
+ ac, cl);
+ if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
+ bg_bh = ocfs2_block_group_alloc_discontig(handle,
+ alloc_inode,
+ ac, cl);
+ if (IS_ERR(bg_bh)) {
+ status = PTR_ERR(bg_bh);
+ bg_bh = NULL;
if (status != -ENOSPC)
mlog_errno(status);
goto bail;
}
-
- alloc_rec = ocfs2_find_smallest_chain(cl);
-
- /* setup the group */
- bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
- mlog(0, "new descriptor, record %u, at block %llu\n",
- alloc_rec, (unsigned long long)bg_blkno);
-
- bg_bh = sb_getblk(osb->sb, bg_blkno);
- if (!bg_bh) {
- status = -EIO;
- mlog_errno(status);
- goto bail;
- }
- ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
-
- status = ocfs2_block_group_fill(handle,
- alloc_inode,
- bg_bh,
- bg_blkno,
- alloc_rec,
- cl);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
bg = (struct ocfs2_group_desc *) bg_bh->b_data;
status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
@@ -498,10 +748,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
goto bail;
}
+ alloc_rec = le16_to_cpu(bg->bg_chain);
le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
le16_to_cpu(bg->bg_free_bits_count));
- le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
- cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno);
+ le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
+ le16_to_cpu(bg->bg_bits));
+ cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
le16_add_cpu(&cl->cl_next_free_rec, 1);
@@ -510,11 +762,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
- status = ocfs2_journal_dirty(handle, bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, bh);
spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -523,6 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
+ ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
status = 0;
@@ -539,7 +788,8 @@ bail:
brelse(bg_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -557,8 +807,6 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
struct ocfs2_dinode *fe;
u32 free_bits;
- mlog_entry_void();
-
alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
if (!alloc_inode) {
mlog_errno(-EINVAL);
@@ -598,16 +846,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
if (bits_wanted > free_bits) {
/* cluster bitmap never grows */
if (ocfs2_is_cluster_bitmap(alloc_inode)) {
- mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
- bits_wanted, free_bits);
+ trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
+ free_bits);
status = -ENOSPC;
goto bail;
}
if (!(flags & ALLOC_NEW_GROUP)) {
- mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
- "and we don't alloc a new group for it.\n",
- slot, bits_wanted, free_bits);
+ trace_ocfs2_reserve_suballoc_bits_no_new_group(
+ slot, bits_wanted, free_bits);
status = -ENOSPC;
goto bail;
}
@@ -633,7 +880,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
bail:
brelse(bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -764,7 +1012,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
status = ocfs2_reserve_suballoc_bits(osb, (*ac),
EXTENT_ALLOC_SYSTEM_INODE,
(u32)osb->slot_num, NULL,
- ALLOC_NEW_GROUP);
+ ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
if (status >= 0) {
@@ -795,7 +1043,8 @@ bail:
*ac = NULL;
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -862,8 +1111,8 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
spin_lock(&osb->osb_lock);
osb->osb_inode_alloc_group = alloc_group;
spin_unlock(&osb->osb_lock);
- mlog(0, "after reservation, new allocation group is "
- "%llu\n", (unsigned long long)alloc_group);
+ trace_ocfs2_reserve_new_inode_new_group(
+ (unsigned long long)alloc_group);
/*
* Some inodes must be freed by us, so try to allocate
@@ -895,7 +1144,8 @@ bail:
*ac = NULL;
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -932,8 +1182,6 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
{
int status;
- mlog_entry_void();
-
*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
if (!(*ac)) {
status = -ENOMEM;
@@ -950,11 +1198,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
status = ocfs2_reserve_local_alloc_bits(osb,
bits_wanted,
*ac);
- if (status == -EFBIG) {
- /* The local alloc window is outside ac_max_block.
- * use the main bitmap. */
- status = -ENOSPC;
- } else if ((status < 0) && (status != -ENOSPC)) {
+ if ((status < 0) && (status != -ENOSPC)) {
mlog_errno(status);
goto bail;
}
@@ -976,7 +1220,8 @@ bail:
*ac = NULL;
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1037,8 +1282,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
struct buffer_head *bg_bh,
unsigned int bits_wanted,
unsigned int total_bits,
- u16 *bit_off,
- u16 *bits_found)
+ struct ocfs2_suballoc_result *res)
{
void *bitmap;
u16 best_offset, best_size;
@@ -1082,14 +1326,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
}
}
- /* XXX: I think the first clause is equivalent to the second
- * - jlbec */
- if (found == bits_wanted) {
- *bit_off = start - found;
- *bits_found = found;
- } else if (best_size) {
- *bit_off = best_offset;
- *bits_found = best_size;
+ if (best_size) {
+ res->sr_bit_offset = best_offset;
+ res->sr_bits = best_size;
} else {
status = -ENOSPC;
/* No error log here -- see the comment above
@@ -1099,7 +1338,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
return status;
}
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
+int ocfs2_block_group_set_bits(handle_t *handle,
struct inode *alloc_inode,
struct ocfs2_group_desc *bg,
struct buffer_head *group_bh,
@@ -1110,15 +1349,12 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
void *bitmap = bg->bg_bitmap;
int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
- mlog_entry_void();
-
/* All callers get the descriptor via
* ocfs2_read_group_descriptor(). Any corruption is a code bug. */
BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
- mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
- num_bits);
+ trace_ocfs2_block_group_set_bits(bit_off, num_bits);
if (ocfs2_is_cluster_bitmap(alloc_inode))
journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
@@ -1133,19 +1369,20 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
}
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
-
+ if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+ ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+ " count %u but claims %u are freed. num_bits %d",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count), num_bits);
+ return -EROFS;
+ }
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
- status = ocfs2_journal_dirty(handle,
- group_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, group_bh);
bail:
- mlog_exit(status);
return status;
}
@@ -1178,7 +1415,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
int status;
/* there is a really tiny chance the journal calls could fail,
* but we wouldn't want inconsistent blocks in *any* case. */
- u64 fe_ptr, bg_ptr, prev_bg_ptr;
+ u64 bg_ptr, prev_bg_ptr;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
@@ -1188,71 +1425,49 @@ static int ocfs2_relink_block_group(handle_t *handle,
BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
- mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
- (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
+ trace_ocfs2_relink_block_group(
+ (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
- fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
bg_ptr = le64_to_cpu(bg->bg_next_group);
prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
prev_bg_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ if (status < 0)
+ goto out;
prev_bg->bg_next_group = bg->bg_next_group;
-
- status = ocfs2_journal_dirty(handle, prev_bg_bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ ocfs2_journal_dirty(handle, prev_bg_bh);
status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ if (status < 0)
+ goto out_rollback_prev_bg;
bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
-
- status = ocfs2_journal_dirty(handle, bg_bh);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ ocfs2_journal_dirty(handle, bg_bh);
status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto out_rollback;
- }
+ if (status < 0)
+ goto out_rollback_bg;
fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
+ ocfs2_journal_dirty(handle, fe_bh);
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
+out:
+ if (status < 0)
mlog_errno(status);
- goto out_rollback;
- }
-
- status = 0;
-out_rollback:
- if (status < 0) {
- fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
- bg->bg_next_group = cpu_to_le64(bg_ptr);
- prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
- }
-
- mlog_exit(status);
return status;
+
+out_rollback_bg:
+ bg->bg_next_group = cpu_to_le64(bg_ptr);
+out_rollback_prev_bg:
+ prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+ goto out;
}
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
@@ -1267,14 +1482,13 @@ static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
- u16 *bit_off, u16 *bits_found)
+ struct ocfs2_suballoc_result *res)
{
int search = -ENOSPC;
int ret;
u64 blkoff;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- u16 tmp_off, tmp_found;
unsigned int max_bits, gd_cluster_off;
BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -1283,7 +1497,7 @@ static int ocfs2_cluster_group_search(struct inode *inode,
max_bits = le16_to_cpu(gd->bg_bits);
/* Tail groups in cluster bitmaps which aren't cpg
- * aligned are prone to partial extention by a failed
+ * aligned are prone to partial extension by a failed
* fs resize. If the file system resize never got to
* update the dinode cluster count, then we don't want
* to trust any clusters past it, regardless of what
@@ -1293,26 +1507,26 @@ static int ocfs2_cluster_group_search(struct inode *inode,
if ((gd_cluster_off + max_bits) >
OCFS2_I(inode)->ip_clusters) {
max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
- mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- OCFS2_I(inode)->ip_clusters, max_bits);
+ trace_ocfs2_cluster_group_search_wrong_max_bits(
+ (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ le16_to_cpu(gd->bg_bits),
+ OCFS2_I(inode)->ip_clusters, max_bits);
}
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
group_bh, bits_wanted,
- max_bits,
- &tmp_off, &tmp_found);
+ max_bits, res);
if (ret)
return ret;
if (max_block) {
blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
gd_cluster_off +
- tmp_off + tmp_found);
- mlog(0, "Checking %llu against %llu\n",
- (unsigned long long)blkoff,
- (unsigned long long)max_block);
+ res->sr_bit_offset +
+ res->sr_bits);
+ trace_ocfs2_cluster_group_search_max_block(
+ (unsigned long long)blkoff,
+ (unsigned long long)max_block);
if (blkoff > max_block)
return -ENOSPC;
}
@@ -1321,16 +1535,14 @@ static int ocfs2_cluster_group_search(struct inode *inode,
* return success, but we still want to return
* -ENOSPC unless it found the minimum number
* of bits. */
- if (min_bits <= tmp_found) {
- *bit_off = tmp_off;
- *bits_found = tmp_found;
+ if (min_bits <= res->sr_bits)
search = 0; /* success */
- } else if (tmp_found) {
+ else if (res->sr_bits) {
/*
* Don't show bits which we'll be returning
* for allocation to the local alloc bitmap.
*/
- ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
+ ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
}
}
@@ -1341,7 +1553,7 @@ static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
u64 max_block,
- u16 *bit_off, u16 *bits_found)
+ struct ocfs2_suballoc_result *res)
{
int ret = -ENOSPC;
u64 blkoff;
@@ -1354,13 +1566,13 @@ static int ocfs2_block_group_search(struct inode *inode,
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
group_bh, bits_wanted,
le16_to_cpu(bg->bg_bits),
- bit_off, bits_found);
+ res);
if (!ret && max_block) {
- blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
- *bits_found;
- mlog(0, "Checking %llu against %llu\n",
- (unsigned long long)blkoff,
- (unsigned long long)max_block);
+ blkoff = le64_to_cpu(bg->bg_blkno) +
+ res->sr_bit_offset + res->sr_bits;
+ trace_ocfs2_block_group_search_max_block(
+ (unsigned long long)blkoff,
+ (unsigned long long)max_block);
if (blkoff > max_block)
ret = -ENOSPC;
}
@@ -1369,7 +1581,7 @@ static int ocfs2_block_group_search(struct inode *inode,
return ret;
}
-static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
handle_t *handle,
struct buffer_head *di_bh,
u32 num_bits,
@@ -1390,33 +1602,91 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
-
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, di_bh);
out:
return ret;
}
+void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain)
+{
+ u32 tmp_used;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+ struct ocfs2_chain_list *cl;
+
+ cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
+ tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
+ di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
+ le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
+}
+
+static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
+ struct ocfs2_extent_rec *rec,
+ struct ocfs2_chain_list *cl)
+{
+ unsigned int bpc = le16_to_cpu(cl->cl_bpc);
+ unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
+ unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
+
+ if (res->sr_bit_offset < bitoff)
+ return 0;
+ if (res->sr_bit_offset >= (bitoff + bitcount))
+ return 0;
+ res->sr_blkno = le64_to_cpu(rec->e_blkno) +
+ (res->sr_bit_offset - bitoff);
+ if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
+ res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
+ return 1;
+}
+
+static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
+ struct ocfs2_group_desc *bg,
+ struct ocfs2_suballoc_result *res)
+{
+ int i;
+ u64 bg_blkno = res->sr_bg_blkno; /* Save off */
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
+ struct ocfs2_chain_list *cl = &di->id2.i_chain;
+
+ if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
+ res->sr_blkno = 0;
+ return;
+ }
+
+ res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
+ res->sr_bg_blkno = 0; /* Clear it for contig block groups */
+ if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
+ !bg->bg_list.l_next_free_rec)
+ return;
+
+ for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
+ rec = &bg->bg_list.l_recs[i];
+ if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
+ res->sr_bg_blkno = bg_blkno; /* Restore */
+ break;
+ }
+ }
+}
+
static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 gd_blkno,
+ struct ocfs2_suballoc_result *res,
u16 *bits_left)
{
int ret;
- u16 found;
struct buffer_head *group_bh = NULL;
struct ocfs2_group_desc *gd;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
struct inode *alloc_inode = ac->ac_inode;
- ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
- &group_bh);
+ ret = ocfs2_read_group_descriptor(alloc_inode, di,
+ res->sr_bg_blkno, &group_bh);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -1424,17 +1694,27 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
gd = (struct ocfs2_group_desc *) group_bh->b_data;
ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
- ac->ac_max_block, bit_off, &found);
+ ac->ac_max_block, res);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
goto out;
}
- *num_bits = found;
+ if (!ret)
+ ocfs2_bg_discontig_fix_result(ac, gd, res);
+
+ /*
+ * sr_bg_blkno might have been changed by
+ * ocfs2_bg_discontig_fix_result
+ */
+ res->sr_bg_stable_blkno = group_bh->b_blocknr;
+
+ if (ac->ac_find_loc_only)
+ goto out_loc_only;
ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
- *num_bits,
+ res->sr_bits,
le16_to_cpu(gd->bg_chain));
if (ret < 0) {
mlog_errno(ret);
@@ -1442,10 +1722,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
}
ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
- *bit_off, *num_bits);
- if (ret < 0)
+ res->sr_bit_offset, res->sr_bits);
+ if (ret < 0) {
+ ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
+ res->sr_bits,
+ le16_to_cpu(gd->bg_chain));
mlog_errno(ret);
+ }
+out_loc_only:
*bits_left = le16_to_cpu(gd->bg_free_bits_count);
out:
@@ -1458,14 +1743,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 *bg_blkno,
+ struct ocfs2_suballoc_result *res,
u16 *bits_left)
{
int status;
- u16 chain, tmp_bits;
- u32 tmp_used;
+ u16 chain;
u64 next_group;
struct inode *alloc_inode = ac->ac_inode;
struct buffer_head *group_bh = NULL;
@@ -1475,9 +1757,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
struct ocfs2_group_desc *bg;
chain = ac->ac_chain;
- mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
- bits_wanted, chain,
- (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
+ trace_ocfs2_search_chain_begin(
+ (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
+ bits_wanted, chain);
status = ocfs2_read_group_descriptor(alloc_inode, fe,
le64_to_cpu(cl->cl_recs[chain].c_blkno),
@@ -1493,8 +1775,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
* the 1st group with any empty bits. */
while ((status = ac->ac_group_search(alloc_inode, group_bh,
bits_wanted, min_bits,
- ac->ac_max_block, bit_off,
- &tmp_bits)) == -ENOSPC) {
+ ac->ac_max_block,
+ res)) == -ENOSPC) {
if (!bg->bg_next_group)
break;
@@ -1518,12 +1800,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
goto bail;
}
- mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
- tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
+ trace_ocfs2_search_chain_succ(
+ (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
+
+ res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
- *num_bits = tmp_bits;
+ BUG_ON(res->sr_bits == 0);
+ if (!status)
+ ocfs2_bg_discontig_fix_result(ac, bg, res);
- BUG_ON(*num_bits == 0);
+ /*
+ * sr_bg_blkno might have been changed by
+ * ocfs2_bg_discontig_fix_result
+ */
+ res->sr_bg_stable_blkno = group_bh->b_blocknr;
/*
* Keep track of previous block descriptor read. When
@@ -1538,9 +1828,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
* Do this *after* figuring out how many bits we're taking out
* of our target group.
*/
- if (ac->ac_allow_chain_relink &&
+ if (!ac->ac_disable_chain_relink &&
(prev_group_bh) &&
- (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
+ (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
status = ocfs2_relink_block_group(handle, alloc_inode,
ac->ac_bh, group_bh,
prev_group_bh, chain);
@@ -1550,24 +1840,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
}
}
- /* Ok, claim our bits now: set the info on dinode, chainlist
- * and then the group */
- status = ocfs2_journal_access_di(handle,
- INODE_CACHE(alloc_inode),
- ac->ac_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
- fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
- le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
+ if (ac->ac_find_loc_only)
+ goto out_loc_only;
- status = ocfs2_journal_dirty(handle,
- ac->ac_bh);
- if (status < 0) {
+ status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
+ ac->ac_bh, res->sr_bits,
+ chain);
+ if (status) {
mlog_errno(status);
goto bail;
}
@@ -1576,45 +1855,44 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
alloc_inode,
bg,
group_bh,
- *bit_off,
- *num_bits);
+ res->sr_bit_offset,
+ res->sr_bits);
if (status < 0) {
+ ocfs2_rollback_alloc_dinode_counts(alloc_inode,
+ ac->ac_bh, res->sr_bits, chain);
mlog_errno(status);
goto bail;
}
- mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
- (unsigned long long)le64_to_cpu(fe->i_blkno));
+ trace_ocfs2_search_chain_end(
+ (unsigned long long)le64_to_cpu(fe->i_blkno),
+ res->sr_bits);
- *bg_blkno = le64_to_cpu(bg->bg_blkno);
+out_loc_only:
*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
brelse(group_bh);
brelse(prev_group_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
/* will give out up to bits_wanted contiguous bits. */
-static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
- struct ocfs2_alloc_context *ac,
+static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
handle_t *handle,
u32 bits_wanted,
u32 min_bits,
- u16 *bit_off,
- unsigned int *num_bits,
- u64 *bg_blkno)
+ struct ocfs2_suballoc_result *res)
{
int status;
u16 victim, i;
u16 bits_left = 0;
- u64 hint_blkno = ac->ac_last_group;
+ u64 hint = ac->ac_last_group;
struct ocfs2_chain_list *cl;
struct ocfs2_dinode *fe;
- mlog_entry_void();
-
BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
BUG_ON(!ac->ac_bh);
@@ -1627,7 +1905,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
le32_to_cpu(fe->id1.bitmap1.i_total)) {
- ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
+ ocfs2_error(ac->ac_inode->i_sb,
+ "Chain allocator dinode %llu has %u used "
"bits but only %u total.",
(unsigned long long)le64_to_cpu(fe->i_blkno),
le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1636,22 +1915,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
goto bail;
}
- if (hint_blkno) {
+ res->sr_bg_blkno = hint;
+ if (res->sr_bg_blkno) {
/* Attempt to short-circuit the usual search mechanism
* by jumping straight to the most recently used
- * allocation group. This helps us mantain some
+ * allocation group. This helps us maintain some
* contiguousness across allocations. */
status = ocfs2_search_one_group(ac, handle, bits_wanted,
- min_bits, bit_off, num_bits,
- hint_blkno, &bits_left);
- if (!status) {
- /* Be careful to update *bg_blkno here as the
- * caller is expecting it to be filled in, and
- * ocfs2_search_one_group() won't do that for
- * us. */
- *bg_blkno = hint_blkno;
+ min_bits, res, &bits_left);
+ if (!status)
goto set_hint;
- }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
@@ -1662,25 +1935,25 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
victim = ocfs2_find_victim_chain(cl);
ac->ac_chain = victim;
- ac->ac_allow_chain_relink = 1;
- status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
- num_bits, bg_blkno, &bits_left);
- if (!status)
+ status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
+ res, &bits_left);
+ if (!status) {
+ hint = ocfs2_group_from_res(res);
goto set_hint;
+ }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
}
- mlog(0, "Search of victim chain %u came up with nothing, "
- "trying all chains now.\n", victim);
+ trace_ocfs2_claim_suballoc_bits(victim);
/* If we didn't pick a good victim, then just default to
* searching each chain in order. Don't allow chain relinking
* because we only calculate enough journal credits for one
* relink per alloc. */
- ac->ac_allow_chain_relink = 0;
+ ac->ac_disable_chain_relink = 1;
for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
if (i == victim)
continue;
@@ -1689,10 +1962,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
ac->ac_chain = i;
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
- bit_off, num_bits, bg_blkno,
- &bits_left);
- if (!status)
+ res, &bits_left);
+ if (!status) {
+ hint = ocfs2_group_from_res(res);
break;
+ }
if (status < 0 && status != -ENOSPC) {
mlog_errno(status);
goto bail;
@@ -1707,56 +1981,58 @@ set_hint:
if (bits_left < min_bits)
ac->ac_last_group = 0;
else
- ac->ac_last_group = *bg_blkno;
+ ac->ac_last_group = hint;
}
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-int ocfs2_claim_metadata(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_metadata(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
+ u64 *suballoc_loc,
u16 *suballoc_bit_start,
unsigned int *num_bits,
u64 *blkno_start)
{
int status;
- u64 bg_blkno;
+ struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
BUG_ON(!ac);
BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
- status = ocfs2_claim_suballoc_bits(osb,
- ac,
+ status = ocfs2_claim_suballoc_bits(ac,
handle,
bits_wanted,
1,
- suballoc_bit_start,
- num_bits,
- &bg_blkno);
+ &res);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- atomic_inc(&osb->alloc_stats.bg_allocs);
+ atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
- *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
- ac->ac_bits_given += (*num_bits);
+ *suballoc_loc = res.sr_bg_blkno;
+ *suballoc_bit_start = res.sr_bit_offset;
+ *blkno_start = res.sr_blkno;
+ ac->ac_bits_given += res.sr_bits;
+ *num_bits = res.sr_bits;
status = 0;
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
static void ocfs2_init_inode_ac_group(struct inode *dir,
- struct buffer_head *parent_fe_bh,
+ struct buffer_head *parent_di_bh,
struct ocfs2_alloc_context *ac)
{
- struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
/*
* Try to allocate inodes from some specific group.
*
@@ -1770,10 +2046,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir,
if (OCFS2_I(dir)->ip_last_used_group &&
OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
- else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
- ac->ac_last_group = ocfs2_which_suballoc_group(
- le64_to_cpu(fe->i_blkno),
- le16_to_cpu(fe->i_suballoc_bit));
+ else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
+ if (di->i_suballoc_loc)
+ ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
+ else
+ ac->ac_last_group = ocfs2_which_suballoc_group(
+ le64_to_cpu(di->i_blkno),
+ le16_to_cpu(di->i_suballoc_bit));
+ }
}
static inline void ocfs2_save_inode_ac_group(struct inode *dir,
@@ -1783,19 +2063,148 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
}
-int ocfs2_claim_new_inode(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_find_new_inode_loc(struct inode *dir,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_alloc_context *ac,
+ u64 *fe_blkno)
+{
+ int ret;
+ handle_t *handle = NULL;
+ struct ocfs2_suballoc_result *res;
+
+ BUG_ON(!ac);
+ BUG_ON(ac->ac_bits_given != 0);
+ BUG_ON(ac->ac_bits_wanted != 1);
+ BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
+
+ res = kzalloc(sizeof(*res), GFP_NOFS);
+ if (res == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
+
+ /*
+ * The handle started here is for chain relink. Alternatively,
+ * we could just disable relink for these calls.
+ */
+ handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * This will instruct ocfs2_claim_suballoc_bits and
+ * ocfs2_search_one_group to search but save actual allocation
+ * for later.
+ */
+ ac->ac_find_loc_only = 1;
+
+ ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ac->ac_find_loc_priv = res;
+ *fe_blkno = res->sr_blkno;
+ ocfs2_update_inode_fsync_trans(handle, dir, 0);
+out:
+ if (handle)
+ ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
+
+ if (ret)
+ kfree(res);
+
+ return ret;
+}
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
+ u16 *suballoc_bit,
+ u64 di_blkno)
+{
+ int ret;
+ u16 chain;
+ struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
+ struct buffer_head *bg_bh = NULL;
+ struct ocfs2_group_desc *bg;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
+
+ /*
+ * Since di_blkno is being passed back in, we check for any
+ * inconsistencies which may have happened between
+ * calls. These are code bugs as di_blkno is not expected to
+ * change once returned from ocfs2_find_new_inode_loc()
+ */
+ BUG_ON(res->sr_blkno != di_blkno);
+
+ ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
+ res->sr_bg_stable_blkno, &bg_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+ chain = le16_to_cpu(bg->bg_chain);
+
+ ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
+ ac->ac_bh, res->sr_bits,
+ chain);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_block_group_set_bits(handle,
+ ac->ac_inode,
+ bg,
+ bg_bh,
+ res->sr_bit_offset,
+ res->sr_bits);
+ if (ret < 0) {
+ ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
+ ac->ac_bh, res->sr_bits, chain);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
+ res->sr_bits);
+
+ atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
+
+ BUG_ON(res->sr_bits != 1);
+
+ *suballoc_loc = res->sr_bg_blkno;
+ *suballoc_bit = res->sr_bit_offset;
+ ac->ac_bits_given++;
+ ocfs2_save_inode_ac_group(dir, ac);
+
+out:
+ brelse(bg_bh);
+
+ return ret;
+}
+
+int ocfs2_claim_new_inode(handle_t *handle,
struct inode *dir,
struct buffer_head *parent_fe_bh,
struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
u16 *suballoc_bit,
u64 *fe_blkno)
{
int status;
- unsigned int num_bits;
- u64 bg_blkno;
-
- mlog_entry_void();
+ struct ocfs2_suballoc_result res;
BUG_ON(!ac);
BUG_ON(ac->ac_bits_given != 0);
@@ -1804,28 +2213,28 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
- status = ocfs2_claim_suballoc_bits(osb,
- ac,
+ status = ocfs2_claim_suballoc_bits(ac,
handle,
1,
1,
- suballoc_bit,
- &num_bits,
- &bg_blkno);
+ &res);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- atomic_inc(&osb->alloc_stats.bg_allocs);
+ atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
- BUG_ON(num_bits != 1);
+ BUG_ON(res.sr_bits != 1);
- *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
+ *suballoc_loc = res.sr_bg_blkno;
+ *suballoc_bit = res.sr_bit_offset;
+ *fe_blkno = res.sr_blkno;
ac->ac_bits_given++;
ocfs2_save_inode_ac_group(dir, ac);
status = 0;
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1890,8 +2299,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
* contig. allocation, set to '1' to indicate we can deal with extents
* of any size.
*/
-int __ocfs2_claim_clusters(struct ocfs2_super *osb,
- handle_t *handle,
+int __ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 max_clusters,
@@ -1900,10 +2308,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
{
int status;
unsigned int bits_wanted = max_clusters;
- u64 bg_blkno = 0;
- u16 bg_bit_off;
-
- mlog_entry_void();
+ struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
+ struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
@@ -1911,6 +2317,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
&& ac->ac_which != OCFS2_AC_USE_MAIN);
if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
+ WARN_ON(min_clusters > 1);
+
status = ocfs2_claim_local_alloc_bits(osb,
handle,
ac,
@@ -1933,20 +2341,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
if (bits_wanted > (osb->bitmap_cpg - 1))
bits_wanted = osb->bitmap_cpg - 1;
- status = ocfs2_claim_suballoc_bits(osb,
- ac,
+ status = ocfs2_claim_suballoc_bits(ac,
handle,
bits_wanted,
min_clusters,
- &bg_bit_off,
- num_clusters,
- &bg_blkno);
+ &res);
if (!status) {
+ BUG_ON(res.sr_blkno); /* cluster alloc can't set */
*cluster_start =
ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
- bg_blkno,
- bg_bit_off);
+ res.sr_bg_blkno,
+ res.sr_bit_offset);
atomic_inc(&osb->alloc_stats.bitmap_data);
+ *num_clusters = res.sr_bits;
}
}
if (status < 0) {
@@ -1958,12 +2365,12 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
ac->ac_bits_given += *num_clusters;
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 *cluster_start,
@@ -1971,45 +2378,41 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
{
unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
- return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
+ return __ocfs2_claim_clusters(handle, ac, min_clusters,
bits_wanted, cluster_start, num_clusters);
}
-static inline int ocfs2_block_group_clear_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits)
+static int ocfs2_block_group_clear_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_group_desc *bg,
+ struct buffer_head *group_bh,
+ unsigned int bit_off,
+ unsigned int num_bits,
+ void (*undo_fn)(unsigned int bit,
+ unsigned long *bmap))
{
int status;
unsigned int tmp;
- int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
struct ocfs2_group_desc *undo_bg = NULL;
- int cluster_bitmap = 0;
-
- mlog_entry_void();
/* The caller got this descriptor from
* ocfs2_read_group_descriptor(). Any corruption is a code bug. */
BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
- mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
-
- if (ocfs2_is_cluster_bitmap(alloc_inode))
- journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+ trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
+ BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
- group_bh, journal_type);
+ group_bh,
+ undo_fn ?
+ OCFS2_JOURNAL_ACCESS_UNDO :
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- if (ocfs2_is_cluster_bitmap(alloc_inode))
- cluster_bitmap = 1;
-
- if (cluster_bitmap) {
+ if (undo_fn) {
jbd_lock_bh_state(group_bh);
undo_bg = (struct ocfs2_group_desc *)
bh2jh(group_bh)->b_committed_data;
@@ -2020,18 +2423,24 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
while(tmp--) {
ocfs2_clear_bit((bit_off + tmp),
(unsigned long *) bg->bg_bitmap);
- if (cluster_bitmap)
- ocfs2_set_bit(bit_off + tmp,
- (unsigned long *) undo_bg->bg_bitmap);
+ if (undo_fn)
+ undo_fn(bit_off + tmp,
+ (unsigned long *) undo_bg->bg_bitmap);
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+ if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+ ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+ " count %u but claims %u are freed. num_bits %d",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count), num_bits);
+ return -EROFS;
+ }
- if (cluster_bitmap)
+ if (undo_fn)
jbd_unlock_bh_state(group_bh);
- status = ocfs2_journal_dirty(handle, group_bh);
- if (status < 0)
- mlog_errno(status);
+ ocfs2_journal_dirty(handle, group_bh);
bail:
return status;
}
@@ -2039,12 +2448,14 @@ bail:
/*
* expects the suballoc inode to already be locked.
*/
-int ocfs2_free_suballoc_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct buffer_head *alloc_bh,
- unsigned int start_bit,
- u64 bg_blkno,
- unsigned int count)
+static int _ocfs2_free_suballoc_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *alloc_bh,
+ unsigned int start_bit,
+ u64 bg_blkno,
+ unsigned int count,
+ void (*undo_fn)(unsigned int bit,
+ unsigned long *bitmap))
{
int status = 0;
u32 tmp_used;
@@ -2053,19 +2464,18 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
struct buffer_head *group_bh = NULL;
struct ocfs2_group_desc *group;
- mlog_entry_void();
-
/* The alloc_bh comes from ocfs2_free_dinode() or
* ocfs2_free_clusters(). The callers have all locked the
* allocator and gotten alloc_bh from the lock call. This
- * validates the dinode buffer. Any corruption that has happended
+ * validates the dinode buffer. Any corruption that has happened
* is a code bug. */
BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
- mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
- (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
- (unsigned long long)bg_blkno, start_bit);
+ trace_ocfs2_free_suballoc_bits(
+ (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
+ (unsigned long long)bg_blkno,
+ start_bit, count);
status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
&group_bh);
@@ -2079,7 +2489,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
status = ocfs2_block_group_clear_bits(handle, alloc_inode,
group, group_bh,
- start_bit, count);
+ start_bit, count, undo_fn);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -2096,20 +2506,27 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
count);
tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
-
- status = ocfs2_journal_dirty(handle, alloc_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_journal_dirty(handle, alloc_bh);
bail:
brelse(group_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
+int ocfs2_free_suballoc_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *alloc_bh,
+ unsigned int start_bit,
+ u64 bg_blkno,
+ unsigned int count)
+{
+ return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
+ start_bit, bg_blkno, count, NULL);
+}
+
int ocfs2_free_dinode(handle_t *handle,
struct inode *inode_alloc_inode,
struct buffer_head *inode_alloc_bh,
@@ -2119,15 +2536,19 @@ int ocfs2_free_dinode(handle_t *handle,
u16 bit = le16_to_cpu(di->i_suballoc_bit);
u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+ if (di->i_suballoc_loc)
+ bg_blkno = le64_to_cpu(di->i_suballoc_loc);
return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
inode_alloc_bh, bit, bg_blkno, 1);
}
-int ocfs2_free_clusters(handle_t *handle,
- struct inode *bitmap_inode,
- struct buffer_head *bitmap_bh,
- u64 start_blk,
- unsigned int num_clusters)
+static int _ocfs2_free_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters,
+ void (*undo_fn)(unsigned int bit,
+ unsigned long *bitmap))
{
int status;
u16 bg_start_bit;
@@ -2136,11 +2557,8 @@ int ocfs2_free_clusters(handle_t *handle,
/* You can't ever have a contiguous set of clusters
* bigger than a block group bitmap so we never have to worry
- * about looping on them. */
-
- mlog_entry_void();
-
- /* This is expensive. We can safely remove once this stuff has
+ * about looping on them.
+ * This is expensive. We can safely remove once this stuff has
* gotten tested really well. */
BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
@@ -2149,14 +2567,13 @@ int ocfs2_free_clusters(handle_t *handle,
ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
&bg_start_bit);
- mlog(0, "want to free %u clusters starting at block %llu\n",
- num_clusters, (unsigned long long)start_blk);
- mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
- (unsigned long long)bg_blkno, bg_start_bit);
+ trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
+ (unsigned long long)start_blk,
+ bg_start_bit, num_clusters);
- status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
- bg_start_bit, bg_blkno,
- num_clusters);
+ status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
+ bg_start_bit, bg_blkno,
+ num_clusters, undo_fn);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -2166,10 +2583,37 @@ int ocfs2_free_clusters(handle_t *handle,
num_clusters);
out:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
+int ocfs2_free_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters)
+{
+ return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+ start_blk, num_clusters,
+ _ocfs2_set_bit);
+}
+
+/*
+ * Give never-used clusters back to the global bitmap. We don't need
+ * to protect these bits in the undo buffer.
+ */
+int ocfs2_release_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters)
+{
+ return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+ start_blk, num_clusters,
+ _ocfs2_clear_bit);
+}
+
static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
{
printk("Block Group:\n");
@@ -2303,13 +2747,14 @@ out:
* suballoc_bit.
*/
static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
- u16 *suballoc_slot, u16 *suballoc_bit)
+ u16 *suballoc_slot, u64 *group_blkno,
+ u16 *suballoc_bit)
{
int status;
struct buffer_head *inode_bh = NULL;
struct ocfs2_dinode *inode_fe;
- mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
+ trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
/* dirty read disk */
status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
@@ -2340,11 +2785,14 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
if (suballoc_bit)
*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
+ if (group_blkno)
+ *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
bail:
brelse(inode_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -2357,29 +2805,31 @@ bail:
*/
static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
struct inode *suballoc,
- struct buffer_head *alloc_bh, u64 blkno,
+ struct buffer_head *alloc_bh,
+ u64 group_blkno, u64 blkno,
u16 bit, int *res)
{
- struct ocfs2_dinode *alloc_fe;
+ struct ocfs2_dinode *alloc_di;
struct ocfs2_group_desc *group;
struct buffer_head *group_bh = NULL;
u64 bg_blkno;
int status;
- mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
- (unsigned int)bit);
+ trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
+ (unsigned int)bit);
- alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
- if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
+ alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
+ if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
(unsigned int)bit,
- ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
+ ocfs2_bits_per_group(&alloc_di->id2.i_chain));
status = -EINVAL;
goto bail;
}
- bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
- status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
+ bg_blkno = group_blkno ? group_blkno :
+ ocfs2_which_suballoc_group(blkno, bit);
+ status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
&group_bh);
if (status < 0) {
mlog(ML_ERROR, "read group %llu failed %d\n",
@@ -2393,7 +2843,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
bail:
brelse(group_bh);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -2413,14 +2864,15 @@ bail:
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
{
int status;
+ u64 group_blkno = 0;
u16 suballoc_bit = 0, suballoc_slot = 0;
struct inode *inode_alloc_inode;
struct buffer_head *alloc_bh = NULL;
- mlog_entry("blkno: %llu", (unsigned long long)blkno);
+ trace_ocfs2_test_inode_bit((unsigned long long)blkno);
status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
- &suballoc_bit);
+ &group_blkno, &suballoc_bit);
if (status < 0) {
mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
goto bail;
@@ -2442,13 +2894,14 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
if (status < 0) {
mutex_unlock(&inode_alloc_inode->i_mutex);
+ iput(inode_alloc_inode);
mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
(u32)suballoc_slot, status);
goto bail;
}
status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
- blkno, suballoc_bit, res);
+ group_blkno, blkno, suballoc_bit, res);
if (status < 0)
mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
@@ -2458,6 +2911,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
iput(inode_alloc_inode);
brelse(alloc_bh);
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index fa60723c43e..2d2501767c0 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -26,13 +26,14 @@
#ifndef _CHAINALLOC_H_
#define _CHAINALLOC_H_
+struct ocfs2_suballoc_result;
typedef int (group_search_t)(struct inode *,
struct buffer_head *,
u32, /* bits_wanted */
u32, /* min_bits */
u64, /* max_block */
- u16 *, /* *bit_off */
- u16 *); /* *bits_found */
+ struct ocfs2_suballoc_result *);
+ /* found bits */
struct ocfs2_alloc_context {
struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -48,12 +49,17 @@ struct ocfs2_alloc_context {
/* these are used by the chain search */
u16 ac_chain;
- int ac_allow_chain_relink;
+ int ac_disable_chain_relink;
group_search_t *ac_group_search;
u64 ac_last_group;
u64 ac_max_block; /* Highest block number to allocate. 0 is
is the same as ~0 - unlimited */
+
+ int ac_find_loc_only; /* hack for reflink operation ordering */
+ struct ocfs2_suballoc_result *ac_find_loc_priv; /* */
+
+ struct ocfs2_alloc_reservation *ac_resv;
};
void ocfs2_init_steal_slots(struct ocfs2_super *osb);
@@ -80,22 +86,37 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
u32 bits_wanted,
struct ocfs2_alloc_context **ac);
-int ocfs2_claim_metadata(struct ocfs2_super *osb,
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
handle_t *handle,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain);
+void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain);
+int ocfs2_block_group_set_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_group_desc *bg,
+ struct buffer_head *group_bh,
+ unsigned int bit_off,
+ unsigned int num_bits);
+
+int ocfs2_claim_metadata(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
+ u64 *suballoc_loc,
u16 *suballoc_bit_start,
u32 *num_bits,
u64 *blkno_start);
-int ocfs2_claim_new_inode(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_new_inode(handle_t *handle,
struct inode *dir,
struct buffer_head *parent_fe_bh,
struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
u16 *suballoc_bit,
u64 *fe_blkno);
-int ocfs2_claim_clusters(struct ocfs2_super *osb,
- handle_t *handle,
+int ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 *cluster_start,
@@ -104,8 +125,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
* Use this variant of ocfs2_claim_clusters to specify a maxiumum
* number of clusters smaller than the allocation reserved.
*/
-int __ocfs2_claim_clusters(struct ocfs2_super *osb,
- handle_t *handle,
+int __ocfs2_claim_clusters(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 min_clusters,
u32 max_clusters,
@@ -127,6 +147,11 @@ int ocfs2_free_clusters(handle_t *handle,
struct buffer_head *bitmap_bh,
u64 start_blk,
unsigned int num_clusters);
+int ocfs2_release_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters);
static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
{
@@ -191,4 +216,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
struct ocfs2_alloc_context **meta_ac);
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
+
+
+
+/*
+ * The following two interfaces are for ocfs2_create_inode_in_orphan().
+ */
+int ocfs2_find_new_inode_loc(struct inode *dir,
+ struct buffer_head *parent_fe_bh,
+ struct ocfs2_alloc_context *ac,
+ u64 *fe_blkno);
+
+int ocfs2_claim_new_inode_at_loc(handle_t *handle,
+ struct inode *dir,
+ struct ocfs2_alloc_context *ac,
+ u64 *suballoc_loc,
+ u16 *suballoc_bit,
+ u64 di_blkno);
+
#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index dee03197a49..ddb662b3244 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,9 +41,11 @@
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/quotaops.h>
-#include <linux/smp_lock.h>
+#include <linux/cleancache.h>
+
+#define CREATE_TRACE_POINTS
+#include "ocfs2_trace.h"
-#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -52,6 +54,7 @@
#include "ocfs1_fs_compat.h"
#include "alloc.h"
+#include "aops.h"
#include "blockcheck.h"
#include "dlmglue.h"
#include "export.h"
@@ -65,7 +68,6 @@
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
-#include "ver.h"
#include "xattr.h"
#include "quota.h"
#include "refcounttree.h"
@@ -73,20 +75,21 @@
#include "buffer_head_io.h"
-static struct kmem_cache *ocfs2_inode_cachep = NULL;
+static struct kmem_cache *ocfs2_inode_cachep;
struct kmem_cache *ocfs2_dquot_cachep;
struct kmem_cache *ocfs2_qf_chunk_cachep;
-/* OCFS2 needs to schedule several differnt types of work which
+/* OCFS2 needs to schedule several different types of work which
* require cluster locking, disk I/O, recovery waits, etc. Since these
* types of work tend to be heavy we avoid using the kernel events
* workqueue and schedule on our own. */
struct workqueue_struct *ocfs2_wq = NULL;
-static struct dentry *ocfs2_debugfs_root = NULL;
+static struct dentry *ocfs2_debugfs_root;
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster file system");
struct mount_options
{
@@ -94,7 +97,9 @@ struct mount_options
unsigned long mount_opt;
unsigned int atime_quantum;
signed short slot;
- unsigned int localalloc_opt;
+ int localalloc_opt;
+ unsigned int resv_level;
+ int dir_resv_level;
char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
};
@@ -103,7 +108,7 @@ static int ocfs2_parse_options(struct super_block *sb, char *options,
int is_remount);
static int ocfs2_check_set_options(struct super_block *sb,
struct mount_options *options);
-static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt);
+static int ocfs2_show_options(struct seq_file *s, struct dentry *root);
static void ocfs2_put_super(struct super_block *sb);
static int ocfs2_mount_volume(struct super_block *sb);
static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -143,8 +148,7 @@ static const struct super_operations ocfs2_sops = {
.alloc_inode = ocfs2_alloc_inode,
.destroy_inode = ocfs2_destroy_inode,
.drop_inode = ocfs2_drop_inode,
- .clear_inode = ocfs2_clear_inode,
- .delete_inode = ocfs2_delete_inode,
+ .evict_inode = ocfs2_evict_inode,
.sync_fs = ocfs2_sync_fs,
.put_super = ocfs2_put_super,
.remount_fs = ocfs2_remount,
@@ -161,6 +165,7 @@ enum {
Opt_nointr,
Opt_hb_none,
Opt_hb_local,
+ Opt_hb_global,
Opt_data_ordered,
Opt_data_writeback,
Opt_atime_quantum,
@@ -176,6 +181,10 @@ enum {
Opt_noacl,
Opt_usrquota,
Opt_grpquota,
+ Opt_coherency_buffered,
+ Opt_coherency_full,
+ Opt_resv_level,
+ Opt_dir_resv_level,
Opt_err,
};
@@ -187,6 +196,7 @@ static const match_table_t tokens = {
{Opt_nointr, "nointr"},
{Opt_hb_none, OCFS2_HB_NONE},
{Opt_hb_local, OCFS2_HB_LOCAL},
+ {Opt_hb_global, OCFS2_HB_GLOBAL},
{Opt_data_ordered, "data=ordered"},
{Opt_data_writeback, "data=writeback"},
{Opt_atime_quantum, "atime_quantum=%u"},
@@ -202,6 +212,10 @@ static const match_table_t tokens = {
{Opt_noacl, "noacl"},
{Opt_usrquota, "usrquota"},
{Opt_grpquota, "grpquota"},
+ {Opt_coherency_buffered, "coherency=buffered"},
+ {Opt_coherency_full, "coherency=full"},
+ {Opt_resv_level, "resv_level=%u"},
+ {Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_err, NULL}
};
@@ -272,10 +286,9 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
spin_unlock(&osb->osb_lock);
out += snprintf(buf + out, len - out,
- "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit",
+ "%10s => Pid: %d Interval: %lu\n", "Commit",
(osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
- osb->osb_commit_interval,
- atomic_read(&osb->needs_checkpoint));
+ osb->osb_commit_interval);
out += snprintf(buf + out, len - out,
"%10s => State: %d TxnId: %lu NumTxns: %d\n",
@@ -431,8 +444,6 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
int status = 0;
int i;
- mlog_entry_void();
-
new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
if (IS_ERR(new)) {
status = PTR_ERR(new);
@@ -468,7 +479,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
}
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -478,8 +490,6 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
int status = 0;
int i;
- mlog_entry_void();
-
for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
i < NUM_SYSTEM_INODES;
i++) {
@@ -498,7 +508,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
}
bail:
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -507,13 +518,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
int i;
struct inode *inode;
- mlog_entry_void();
-
- for (i = 0; i < NUM_SYSTEM_INODES; i++) {
- inode = osb->system_inodes[i];
+ for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
+ inode = osb->global_system_inodes[i];
if (inode) {
iput(inode);
- osb->system_inodes[i] = NULL;
+ osb->global_system_inodes[i] = NULL;
}
}
@@ -529,7 +538,18 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
osb->root_inode = NULL;
}
- mlog_exit(0);
+ if (!osb->local_system_inodes)
+ return;
+
+ for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
+ if (osb->local_system_inodes[i]) {
+ iput(osb->local_system_inodes[i]);
+ osb->local_system_inodes[i] = NULL;
+ }
+ }
+
+ kfree(osb->local_system_inodes);
+ osb->local_system_inodes = NULL;
}
/* We're allocating fs objects, use GFP_NOFS */
@@ -541,15 +561,24 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
if (!oi)
return NULL;
+ oi->i_sync_tid = 0;
+ oi->i_datasync_tid = 0;
+
jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
return &oi->vfs_inode;
}
-static void ocfs2_destroy_inode(struct inode *inode)
+static void ocfs2_i_callback(struct rcu_head *head)
{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
}
+static void ocfs2_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, ocfs2_i_callback);
+}
+
static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
unsigned int cbits)
{
@@ -603,8 +632,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
int ret = 0;
struct mount_options parsed_options;
struct ocfs2_super *osb = OCFS2_SB(sb);
+ u32 tmp;
- lock_kernel();
+ sync_filesystem(sb);
if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
!ocfs2_check_set_options(sb, &parsed_options)) {
@@ -612,8 +642,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
goto out;
}
- if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
- (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+ tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
+ OCFS2_MOUNT_HB_NONE;
+ if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
ret = -EINVAL;
mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
goto out;
@@ -653,12 +684,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
}
if (*flags & MS_RDONLY) {
- mlog(0, "Going to ro mode.\n");
sb->s_flags |= MS_RDONLY;
osb->osb_flags |= OCFS2_OSB_SOFT_RO;
} else {
- mlog(0, "Making ro filesystem writeable.\n");
-
if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
mlog(ML_ERROR, "Cannot remount RDWR "
"filesystem due to previous errors.\n");
@@ -676,6 +704,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
sb->s_flags &= ~MS_RDONLY;
osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
}
+ trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
unlock_osb:
spin_unlock(&osb->osb_lock);
/* Enable quota accounting after remounting RW */
@@ -712,7 +741,6 @@ unlock_osb:
MS_POSIXACL : 0);
}
out:
- unlock_kernel();
return ret;
}
@@ -804,23 +832,29 @@ bail:
static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
{
- if (ocfs2_mount_local(osb)) {
- if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+ u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
+
+ if (osb->s_mount_opt & hb_enabled) {
+ if (ocfs2_mount_local(osb)) {
mlog(ML_ERROR, "Cannot heartbeat on a locally "
"mounted device.\n");
return -EINVAL;
}
- }
-
- if (ocfs2_userspace_stack(osb)) {
- if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+ if (ocfs2_userspace_stack(osb)) {
mlog(ML_ERROR, "Userspace stack expected, but "
"o2cb heartbeat arguments passed to mount\n");
return -EINVAL;
}
+ if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
+ !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
+ ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
+ ocfs2_cluster_o2cb_global_heartbeat(osb))) {
+ mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
+ return -EINVAL;
+ }
}
- if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+ if (!(osb->s_mount_opt & hb_enabled)) {
if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
!ocfs2_userspace_stack(osb)) {
mlog(ML_ERROR, "Heartbeat has to be started to mount "
@@ -873,13 +907,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
continue;
if (unsuspend)
- status = vfs_quota_enable(
- sb_dqopt(sb)->files[type],
- type, QFMT_OCFS2,
- DQUOT_SUSPENDED);
- else
- status = vfs_quota_disable(sb, type,
- DQUOT_SUSPENDED);
+ status = dquot_resume(sb, type);
+ else {
+ struct ocfs2_mem_dqinfo *oinfo;
+
+ /* Cancel periodic syncing before suspending */
+ oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+ status = dquot_suspend(sb, type);
+ }
if (status < 0)
break;
}
@@ -910,8 +946,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
status = -ENOENT;
goto out_quota_off;
}
- status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
- DQUOT_USAGE_ENABLED);
+ status = dquot_enable(inode[type], type, QFMT_OCFS2,
+ DQUOT_USAGE_ENABLED);
if (status < 0)
goto out_quota_off;
}
@@ -932,18 +968,22 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
int type;
struct inode *inode;
struct super_block *sb = osb->sb;
+ struct ocfs2_mem_dqinfo *oinfo;
/* We mostly ignore errors in this function because there's not much
* we can do when we see them */
for (type = 0; type < MAXQUOTAS; type++) {
if (!sb_has_quota_loaded(sb, type))
continue;
+ /* Cancel periodic syncing before we grab dqonoff_mutex */
+ oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
inode = igrab(sb->s_dquot.files[type]);
/* Turn off quotas. This will remove all dquot structures from
* memory and so they will be automatically synced to global
* quota files */
- vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
- DQUOT_LIMITS_ENABLED);
+ dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED);
if (!inode)
continue;
iput(inode);
@@ -951,8 +991,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
}
/* Handle quota on quotactl */
-static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
- char *path, int remount)
+static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
{
unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -960,30 +999,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
return -EINVAL;
- if (remount)
- return 0; /* Just ignore it has been handled in
- * ocfs2_remount() */
- return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
- format_id, DQUOT_LIMITS_ENABLED);
+ return dquot_enable(sb_dqopt(sb)->files[type], type,
+ format_id, DQUOT_LIMITS_ENABLED);
}
/* Handle quota off quotactl */
-static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+static int ocfs2_quota_off(struct super_block *sb, int type)
{
- if (remount)
- return 0; /* Ignore now and handle later in
- * ocfs2_remount() */
- return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
+ return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
}
static const struct quotactl_ops ocfs2_quotactl_ops = {
- .quota_on = ocfs2_quota_on,
+ .quota_on_meta = ocfs2_quota_on,
.quota_off = ocfs2_quota_off,
- .quota_sync = vfs_quota_sync,
- .get_info = vfs_get_dqinfo,
- .set_info = vfs_set_dqinfo,
- .get_dqblk = vfs_get_dqblk,
- .set_dqblk = vfs_set_dqblk,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk,
};
static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
@@ -994,10 +1027,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
struct inode *inode = NULL;
struct ocfs2_super *osb = NULL;
struct buffer_head *bh = NULL;
- char nodestr[8];
+ char nodestr[12];
struct ocfs2_blockcheck_stats stats;
- mlog_entry("%p, %p, %i", sb, data, silent);
+ trace_ocfs2_fill_super(sb, data, silent);
if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
status = -EINVAL;
@@ -1028,8 +1061,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
osb->osb_commit_interval = parsed_options.commit_interval;
- osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
- osb->local_alloc_bits = osb->local_alloc_default_bits;
+
+ ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
+ osb->osb_resv_level = parsed_options.resv_level;
+ osb->osb_dir_resv_level = parsed_options.resv_level;
+ if (parsed_options.dir_resv_level == -1)
+ osb->osb_dir_resv_level = parsed_options.resv_level;
+ else
+ osb->osb_dir_resv_level = parsed_options.dir_resv_level;
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
if (status)
@@ -1037,7 +1076,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = OCFS2_SUPER_MAGIC;
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
@@ -1072,9 +1111,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
ocfs2_set_ro_flag(osb, 1);
- printk(KERN_NOTICE "Readonly device detected. No cluster "
- "services will be utilized for this mount. Recovery "
- "will be skipped.\n");
+ printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
+ "Cluster services will not be used for this mount. "
+ "Recovery will be skipped.\n", osb->dev_str);
}
if (!ocfs2_is_hard_readonly(osb)) {
@@ -1119,19 +1158,19 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
}
status = ocfs2_mount_volume(sb);
- if (osb->root_inode)
- inode = igrab(osb->root_inode);
-
if (status < 0)
goto read_super_error;
+ if (osb->root_inode)
+ inode = igrab(osb->root_inode);
+
if (!inode) {
status = -EIO;
mlog_errno(status);
goto read_super_error;
}
- root = d_alloc_root(inode);
+ root = d_make_root(inode);
if (!root) {
status = -ENOMEM;
mlog_errno(status);
@@ -1167,7 +1206,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
mlog_errno(status);
atomic_set(&osb->vol_state, VOLUME_DISABLED);
wake_up(&osb->osb_mount_event);
- mlog_exit(status);
return status;
}
}
@@ -1181,63 +1219,39 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
/* Start this when the mount is almost sure of being successful */
ocfs2_orphan_scan_start(osb);
- mlog_exit(status);
return status;
read_super_error:
brelse(bh);
- if (inode)
- iput(inode);
-
if (osb) {
atomic_set(&osb->vol_state, VOLUME_DISABLED);
wake_up(&osb->osb_mount_event);
ocfs2_dismount_volume(sb, 1);
}
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
-static int ocfs2_get_sb(struct file_system_type *fs_type,
+static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
int flags,
const char *dev_name,
- void *data,
- struct vfsmount *mnt)
+ void *data)
{
- return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super,
- mnt);
-}
-
-static void ocfs2_kill_sb(struct super_block *sb)
-{
- struct ocfs2_super *osb = OCFS2_SB(sb);
-
- /* Failed mount? */
- if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED)
- goto out;
-
- /* Prevent further queueing of inode drop events */
- spin_lock(&dentry_list_lock);
- ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
- spin_unlock(&dentry_list_lock);
- /* Wait for work to finish and/or remove it */
- cancel_work_sync(&osb->dentry_lock_work);
-out:
- kill_block_super(sb);
+ return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
}
static struct file_system_type ocfs2_fs_type = {
.owner = THIS_MODULE,
.name = "ocfs2",
- .get_sb = ocfs2_get_sb, /* is this called when we mount
- * the fs? */
- .kill_sb = ocfs2_kill_sb,
-
+ .mount = ocfs2_mount,
+ .kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
.next = NULL
};
+MODULE_ALIAS_FS("ocfs2");
static int ocfs2_check_set_options(struct super_block *sb,
struct mount_options *options)
@@ -1278,18 +1292,20 @@ static int ocfs2_parse_options(struct super_block *sb,
struct mount_options *mopt,
int is_remount)
{
- int status;
+ int status, user_stack = 0;
char *p;
+ u32 tmp;
- mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
- options ? options : "(none)");
+ trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
mopt->commit_interval = 0;
- mopt->mount_opt = 0;
+ mopt->mount_opt = OCFS2_MOUNT_NOINTR;
mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
mopt->slot = OCFS2_INVALID_SLOT;
- mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+ mopt->localalloc_opt = -1;
mopt->cluster_stack[0] = '\0';
+ mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
+ mopt->dir_resv_level = -1;
if (!options) {
status = 1;
@@ -1309,7 +1325,10 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
break;
case Opt_hb_none:
- mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
+ mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
+ break;
+ case Opt_hb_global:
+ mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
break;
case Opt_barrier:
if (match_int(&args[0], &option)) {
@@ -1380,7 +1399,7 @@ static int ocfs2_parse_options(struct super_block *sb,
status = 0;
goto bail;
}
- if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
+ if (option >= 0)
mopt->localalloc_opt = option;
break;
case Opt_localflocks:
@@ -1415,6 +1434,15 @@ static int ocfs2_parse_options(struct super_block *sb,
memcpy(mopt->cluster_stack, args[0].from,
OCFS2_STACK_LABEL_LEN);
mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+ /*
+ * Open code the memcmp here as we don't have
+ * an osb to pass to
+ * ocfs2_userspace_stack().
+ */
+ if (memcmp(mopt->cluster_stack,
+ OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ user_stack = 1;
break;
case Opt_inode64:
mopt->mount_opt |= OCFS2_MOUNT_INODE64;
@@ -1425,6 +1453,12 @@ static int ocfs2_parse_options(struct super_block *sb,
case Opt_grpquota:
mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
break;
+ case Opt_coherency_buffered:
+ mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
+ break;
+ case Opt_coherency_full:
+ mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
+ break;
case Opt_acl:
mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1433,6 +1467,28 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
break;
+ case Opt_resv_level:
+ if (is_remount)
+ break;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option >= OCFS2_MIN_RESV_LEVEL &&
+ option < OCFS2_MAX_RESV_LEVEL)
+ mopt->resv_level = option;
+ break;
+ case Opt_dir_resv_level:
+ if (is_remount)
+ break;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option >= OCFS2_MIN_RESV_LEVEL &&
+ option < OCFS2_MAX_RESV_LEVEL)
+ mopt->dir_resv_level = option;
+ break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1442,23 +1498,38 @@ static int ocfs2_parse_options(struct super_block *sb,
}
}
+ if (user_stack == 0) {
+ /* Ensure only one heartbeat mode */
+ tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
+ OCFS2_MOUNT_HB_GLOBAL |
+ OCFS2_MOUNT_HB_NONE);
+ if (hweight32(tmp) != 1) {
+ mlog(ML_ERROR, "Invalid heartbeat mount options\n");
+ status = 0;
+ goto bail;
+ }
+ }
+
status = 1;
bail:
- mlog_exit(status);
return status;
}
-static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
{
- struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
+ struct ocfs2_super *osb = OCFS2_SB(root->d_sb);
unsigned long opts = osb->s_mount_opt;
unsigned int local_alloc_megs;
- if (opts & OCFS2_MOUNT_HB_LOCAL)
- seq_printf(s, ",_netdev,heartbeat=local");
- else
- seq_printf(s, ",heartbeat=none");
+ if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
+ seq_printf(s, ",_netdev");
+ if (opts & OCFS2_MOUNT_HB_LOCAL)
+ seq_printf(s, ",%s", OCFS2_HB_LOCAL);
+ else
+ seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
+ } else
+ seq_printf(s, ",%s", OCFS2_HB_NONE);
if (opts & OCFS2_MOUNT_NOINTR)
seq_printf(s, ",nointr");
@@ -1479,15 +1550,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (osb->preferred_slot != OCFS2_INVALID_SLOT)
seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
- if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
- seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
+ seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
if (osb->osb_commit_interval)
seq_printf(s, ",commit=%u",
(unsigned) (osb->osb_commit_interval / HZ));
local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
- if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+ if (local_alloc_megs != ocfs2_la_default_mb(osb))
seq_printf(s, ",localalloc=%d", local_alloc_megs);
if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -1501,6 +1571,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (opts & OCFS2_MOUNT_GRPQUOTA)
seq_printf(s, ",grpquota");
+ if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
+ seq_printf(s, ",coherency=buffered");
+ else
+ seq_printf(s, ",coherency=full");
+
if (opts & OCFS2_MOUNT_NOUSERXATTR)
seq_printf(s, ",nouser_xattr");
else
@@ -1514,6 +1589,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
else
seq_printf(s, ",noacl");
+ if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
+ seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
+
+ if (osb->osb_dir_resv_level != osb->osb_resv_level)
+ seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
+
return 0;
}
@@ -1521,26 +1602,18 @@ static int __init ocfs2_init(void)
{
int status;
- mlog_entry_void();
-
- ocfs2_print_version();
-
status = init_ocfs2_uptodate_cache();
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ if (status < 0)
+ goto out1;
status = ocfs2_initialize_mem_caches();
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ if (status < 0)
+ goto out2;
ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
if (!ocfs2_wq) {
status = -ENOMEM;
- goto leave;
+ goto out3;
}
ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
@@ -1549,34 +1622,30 @@ static int __init ocfs2_init(void)
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
}
- status = ocfs2_quota_setup();
- if (status)
- goto leave;
-
ocfs2_set_locking_protocol();
status = register_quota_format(&ocfs2_quota_format);
-leave:
- if (status < 0) {
- ocfs2_quota_shutdown();
- ocfs2_free_mem_caches();
- exit_ocfs2_uptodate_cache();
- }
-
- mlog_exit(status);
+ if (status < 0)
+ goto out4;
+ status = register_filesystem(&ocfs2_fs_type);
+ if (!status)
+ return 0;
- if (status >= 0) {
- return register_filesystem(&ocfs2_fs_type);
- } else
- return -1;
+ unregister_quota_format(&ocfs2_quota_format);
+out4:
+ destroy_workqueue(ocfs2_wq);
+ debugfs_remove(ocfs2_debugfs_root);
+out3:
+ ocfs2_free_mem_caches();
+out2:
+ exit_ocfs2_uptodate_cache();
+out1:
+ mlog_errno(status);
+ return status;
}
static void __exit ocfs2_exit(void)
{
- mlog_entry_void();
-
- ocfs2_quota_shutdown();
-
if (ocfs2_wq) {
flush_workqueue(ocfs2_wq);
destroy_workqueue(ocfs2_wq);
@@ -1591,22 +1660,14 @@ static void __exit ocfs2_exit(void)
unregister_filesystem(&ocfs2_fs_type);
exit_ocfs2_uptodate_cache();
-
- mlog_exit_void();
}
static void ocfs2_put_super(struct super_block *sb)
{
- mlog_entry("(0x%p)\n", sb);
-
- lock_kernel();
+ trace_ocfs2_put_super(sb);
ocfs2_sync_blockdev(sb);
ocfs2_dismount_volume(sb, 0);
-
- unlock_kernel();
-
- mlog_exit_void();
}
static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1618,7 +1679,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
struct buffer_head *bh = NULL;
struct inode *inode = NULL;
- mlog_entry("(%p, %p)\n", dentry->d_sb, buf);
+ trace_ocfs2_statfs(dentry->d_sb, buf);
osb = OCFS2_SB(dentry->d_sb);
@@ -1665,7 +1726,8 @@ bail:
if (inode)
iput(inode);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -1680,7 +1742,7 @@ static void ocfs2_inode_init_once(void *data)
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
oi->ip_dir_start_lookup = 0;
-
+ mutex_init(&oi->ip_unaligned_aio);
init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
@@ -1688,6 +1750,8 @@ static void ocfs2_inode_init_once(void *data)
oi->ip_blkno = 0ULL;
oi->ip_clusters = 0;
+ ocfs2_resv_init_once(&oi->ip_la_data_resv);
+
ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
@@ -1733,6 +1797,11 @@ static int ocfs2_initialize_mem_caches(void)
static void ocfs2_free_mem_caches(void)
{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
if (ocfs2_inode_cachep)
kmem_cache_destroy(ocfs2_inode_cachep);
ocfs2_inode_cachep = NULL;
@@ -1758,8 +1827,8 @@ static int ocfs2_get_sector(struct super_block *sb,
*bh = sb_getblk(sb, block);
if (!*bh) {
- mlog_errno(-EIO);
- return -EIO;
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
lock_buffer(*bh);
if (!buffer_dirty(*bh))
@@ -1783,8 +1852,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
int unlock_super = 0;
struct ocfs2_super *osb = OCFS2_SB(sb);
- mlog_entry_void();
-
if (ocfs2_is_hard_readonly(osb))
goto leave;
@@ -1829,7 +1896,6 @@ leave:
if (unlock_super)
ocfs2_super_unlock(osb, 1);
- mlog_exit(status);
return status;
}
@@ -1837,9 +1903,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
{
int tmp, hangup_needed = 0;
struct ocfs2_super *osb = NULL;
- char nodestr[8];
+ char nodestr[12];
- mlog_entry("(0x%p)\n", sb);
+ trace_ocfs2_dismount_volume(sb);
BUG_ON(!sb);
osb = OCFS2_SB(sb);
@@ -1847,17 +1913,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
debugfs_remove(osb->osb_ctxt);
- /*
- * Flush inode dropping work queue so that deletes are
- * performed while the filesystem is still working
- */
- ocfs2_drop_all_dl_inodes(osb);
-
/* Orphan scan should be stopped as early as possible */
ocfs2_orphan_scan_stop(osb);
ocfs2_disable_quotas(osb);
+ /* All dquots should be freed by now */
+ WARN_ON(!llist_empty(&osb->dquot_drop_list));
+ /* Wait for worker to be done with the work structure in osb */
+ cancel_work_sync(&osb->dquot_drop_work);
+
ocfs2_shutdown_local_alloc(osb);
ocfs2_truncate_log_shutdown(osb);
@@ -1895,7 +1960,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
* If we failed before we got a uuid_str yet, we can't stop
* heartbeat. Otherwise, do it.
*/
- if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+ if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
+ !ocfs2_is_hard_readonly(osb))
hangup_needed = 1;
if (osb->cconn)
@@ -1947,6 +2013,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
return 0;
}
+/* Make sure entire volume is addressable by our journal. Requires
+ osb_clusters_at_boot to be valid and for the journal to have been
+ initialized by ocfs2_journal_init(). */
+static int ocfs2_journal_addressable(struct ocfs2_super *osb)
+{
+ int status = 0;
+ u64 max_block =
+ ocfs2_clusters_to_blocks(osb->sb,
+ osb->osb_clusters_at_boot) - 1;
+
+ /* 32-bit block number is always OK. */
+ if (max_block <= (u32)~0ULL)
+ goto out;
+
+ /* Volume is "huge", so see if our journal is new enough to
+ support it. */
+ if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_COMPAT_JBD2_SB) &&
+ jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_64BIT))) {
+ mlog(ML_ERROR, "The journal cannot address the entire volume. "
+ "Enable the 'block64' journal option with tunefs.ocfs2");
+ status = -EFBIG;
+ goto out;
+ }
+
+ out:
+ return status;
+}
+
static int ocfs2_initialize_super(struct super_block *sb,
struct buffer_head *bh,
int sector_size,
@@ -1957,10 +2053,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
struct inode *inode = NULL;
struct ocfs2_journal *journal;
- __le32 uuid_net_key;
struct ocfs2_super *osb;
-
- mlog_entry_void();
+ u64 total_blocks;
osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL);
if (!osb) {
@@ -1971,6 +2065,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_fs_info = osb;
sb->s_op = &ocfs2_sops;
+ sb->s_d_op = &ocfs2_dentry_ops;
sb->s_export_op = &ocfs2_export_ops;
sb->s_qcop = &ocfs2_quotactl_ops;
sb->dq_op = &ocfs2_quota_operations;
@@ -2003,6 +2098,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
spin_lock_init(&osb->osb_xattr_lock);
ocfs2_init_steal_slots(osb);
+ mutex_init(&osb->system_file_mutex);
+
atomic_set(&osb->alloc_stats.moves, 0);
atomic_set(&osb->alloc_stats.local_data, 0);
atomic_set(&osb->alloc_stats.bitmap_data, 0);
@@ -2017,6 +2114,14 @@ static int ocfs2_initialize_super(struct super_block *sb,
snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+ osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
+ if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
+ mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
+ osb->max_slots);
+ status = -EINVAL;
+ goto bail;
+ }
+
ocfs2_orphan_scan_init(osb);
status = ocfs2_recovery_init(osb);
@@ -2027,7 +2132,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
init_waitqueue_head(&osb->checkpoint_event);
- atomic_set(&osb->needs_checkpoint, 0);
osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
@@ -2042,6 +2146,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
init_waitqueue_head(&osb->osb_mount_event);
+ status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
if (!osb->vol_label) {
mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -2049,15 +2159,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
- if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
- mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
- osb->max_slots);
- status = -EINVAL;
- goto bail;
- }
- mlog(0, "max_slots for this device: %u\n", osb->max_slots);
-
osb->slot_recovery_generations =
kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
GFP_KERNEL);
@@ -2100,11 +2201,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- if (ocfs2_userspace_stack(osb)) {
- memcpy(osb->osb_cluster_stack,
+ if (ocfs2_clusterinfo_valid(osb)) {
+ osb->osb_stackflags =
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
+ strlcpy(osb->osb_cluster_stack,
OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
- OCFS2_STACK_LABEL_LEN);
- osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+ OCFS2_STACK_LABEL_LEN + 1);
if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
mlog(ML_ERROR,
"couldn't mount because of an invalid "
@@ -2113,6 +2215,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = -EINVAL;
goto bail;
}
+ strlcpy(osb->osb_cluster_name,
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
+ OCFS2_CLUSTER_NAME_LEN + 1);
} else {
/* The empty string is identical with classic tools that
* don't know about s_cluster_info. */
@@ -2148,14 +2253,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
journal->j_state = OCFS2_JOURNAL_FREE;
- INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes);
- osb->dentry_lock_list = NULL;
+ INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
+ init_llist_head(&osb->dquot_drop_list);
/* get some pseudo constants for clustersize bits */
osb->s_clustersize_bits =
le32_to_cpu(di->id2.i_super.s_clustersize_bits);
osb->s_clustersize = 1 << osb->s_clustersize_bits;
- mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
@@ -2165,11 +2269,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
- > (u32)~0UL) {
- mlog(ML_ERROR, "Volume might try to write to blocks beyond "
- "what jbd can address in 32 bits.\n");
- status = -EINVAL;
+ total_blocks = ocfs2_clusters_to_blocks(osb->sb,
+ le32_to_cpu(di->i_clusters));
+
+ status = generic_check_addressable(osb->sb->s_blocksize_bits,
+ total_blocks);
+ if (status) {
+ mlog(ML_ERROR, "Volume too large "
+ "to mount safely on this system");
+ status = -EFBIG;
goto bail;
}
@@ -2180,21 +2288,18 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
-
- strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
- osb->vol_label[63] = '\0';
+ strlcpy(osb->vol_label, di->id2.i_super.s_label,
+ OCFS2_MAX_VOL_LABEL_LEN);
osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
osb->first_cluster_group_blkno =
le64_to_cpu(di->id2.i_super.s_first_cluster_group);
osb->fs_generation = le32_to_cpu(di->i_fs_generation);
osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
- mlog(0, "vol_label: %s\n", osb->vol_label);
- mlog(0, "uuid: %s\n", osb->uuid_str);
- mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
- (unsigned long long)osb->root_blkno,
- (unsigned long long)osb->system_dir_blkno);
+ trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str,
+ (unsigned long long)osb->root_blkno,
+ (unsigned long long)osb->system_dir_blkno,
+ osb->s_clustersize_bits);
osb->osb_dlm_debug = ocfs2_new_dlm_debug();
if (!osb->osb_dlm_debug) {
@@ -2224,18 +2329,20 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
+ osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
iput(inode);
- osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
+ osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
+ osb->s_feature_incompat) * 8;
status = ocfs2_init_slot_info(osb);
if (status < 0) {
mlog_errno(status);
goto bail;
}
+ cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
bail:
- mlog_exit(status);
return status;
}
@@ -2251,8 +2358,6 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
{
int status = -EAGAIN;
- mlog_entry_void();
-
if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
/* We have to do a raw check of the feature here */
@@ -2307,7 +2412,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
}
out:
- mlog_exit(status);
+ if (status && status != -EAGAIN)
+ mlog_errno(status);
return status;
}
@@ -2320,8 +2426,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
* recover
* ourselves. */
- mlog_entry_void();
-
/* Init our journal object. */
status = ocfs2_journal_init(osb->journal, &dirty);
if (status < 0) {
@@ -2329,6 +2433,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
goto finally;
}
+ /* Now that journal has been initialized, check to make sure
+ entire volume is addressable. */
+ status = ocfs2_journal_addressable(osb);
+ if (status)
+ goto finally;
+
/* If the journal was unmounted cleanly then we don't want to
* recover anything. Otherwise, journal_load will do that
* dirty work for us :) */
@@ -2339,8 +2449,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
goto finally;
}
} else {
- mlog(ML_NOTICE, "File system was not unmounted cleanly, "
- "recovering volume.\n");
+ printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
+ "unmounted cleanly, recovering it.\n", osb->dev_str);
}
local = ocfs2_mount_local(osb);
@@ -2365,8 +2475,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
* ourselves as mounted. */
}
- mlog(0, "Journal loaded.\n");
-
status = ocfs2_load_local_alloc(osb);
if (status < 0) {
mlog_errno(status);
@@ -2395,10 +2503,10 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
mlog_errno(status);
finally:
- if (local_alloc)
- kfree(local_alloc);
+ kfree(local_alloc);
- mlog_exit(status);
+ if (status)
+ mlog_errno(status);
return status;
}
@@ -2410,8 +2518,6 @@ finally:
*/
static void ocfs2_delete_osb(struct ocfs2_super *osb)
{
- mlog_entry_void();
-
/* This function assumes that the caller has the main osb resource */
ocfs2_free_slot_info(osb);
@@ -2420,17 +2526,14 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
kfree(osb->slot_recovery_generations);
/* FIXME
* This belongs in journal shutdown, but because we have to
- * allocate osb->journal at the start of ocfs2_initalize_osb(),
+ * allocate osb->journal at the start of ocfs2_initialize_osb(),
* we free it here.
*/
kfree(osb->journal);
- if (osb->local_alloc_copy)
- kfree(osb->local_alloc_copy);
+ kfree(osb->local_alloc_copy);
kfree(osb->uuid_str);
ocfs2_put_dlm_debug(osb->osb_dlm_debug);
memset(osb, 0, sizeof(struct ocfs2_super));
-
- mlog_exit_void();
}
/* Put OCFS2 into a readonly state, or (if the user specifies it),
@@ -2509,5 +2612,25 @@ void __ocfs2_abort(struct super_block* sb,
ocfs2_handle_error(sb);
}
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset)
+{
+ int rc;
+ sigset_t blocked;
+
+ sigfillset(&blocked);
+ rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
+ BUG_ON(rc);
+}
+
+void ocfs2_unblock_signals(sigset_t *oldset)
+{
+ int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
+ BUG_ON(rc);
+}
+
module_init(ocfs2_init);
module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a..74ff74cf78f 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -31,18 +31,23 @@ extern struct workqueue_struct *ocfs2_wq;
int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
-void __ocfs2_error(struct super_block *sb,
- const char *function,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
+__printf(3, 4)
+void __ocfs2_error(struct super_block *sb, const char *function,
+ const char *fmt, ...);
#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
-void __ocfs2_abort(struct super_block *sb,
- const char *function,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
+__printf(3, 4)
+void __ocfs2_abort(struct super_block *sb, const char *function,
+ const char *fmt, ...);
#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+/*
+ * Void signal blockers, because in-kernel sigprocmask() only fails
+ * when SIG_* is wrong.
+ */
+void ocfs2_block_signals(sigset_t *oldset);
+void ocfs2_unblock_signals(sigset_t *oldset);
+
#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 32499d213fc..66edce7ecfd 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -40,7 +40,6 @@
#include <linux/pagemap.h>
#include <linux/namei.h>
-#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -55,106 +54,40 @@
#include "buffer_head_io.h"
-static char *ocfs2_fast_symlink_getlink(struct inode *inode,
- struct buffer_head **bh)
+static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
{
- int status;
- char *link = NULL;
+ struct inode *inode = page->mapping->host;
+ struct buffer_head *bh = NULL;
+ int status = ocfs2_read_inode_block(inode, &bh);
struct ocfs2_dinode *fe;
+ const char *link;
+ void *kaddr;
+ size_t len;
- mlog_entry_void();
-
- status = ocfs2_read_inode_block(inode, bh);
if (status < 0) {
mlog_errno(status);
- link = ERR_PTR(status);
- goto bail;
+ return status;
}
- fe = (struct ocfs2_dinode *) (*bh)->b_data;
+ fe = (struct ocfs2_dinode *) bh->b_data;
link = (char *) fe->id2.i_symlink;
-bail:
- mlog_exit(status);
-
- return link;
-}
-
-static int ocfs2_readlink(struct dentry *dentry,
- char __user *buffer,
- int buflen)
-{
- int ret;
- char *link;
- struct buffer_head *bh = NULL;
- struct inode *inode = dentry->d_inode;
-
- mlog_entry_void();
-
- link = ocfs2_fast_symlink_getlink(inode, &bh);
- if (IS_ERR(link)) {
- ret = PTR_ERR(link);
- goto out;
- }
-
- /*
- * Without vfsmount we can't update atime now,
- * but we will update atime here ultimately.
- */
- ret = vfs_readlink(dentry, buffer, buflen, link);
-
- brelse(bh);
-out:
- mlog_exit(ret);
- return ret;
-}
-
-static void *ocfs2_fast_follow_link(struct dentry *dentry,
- struct nameidata *nd)
-{
- int status = 0;
- int len;
- char *target, *link = ERR_PTR(-ENOMEM);
- struct inode *inode = dentry->d_inode;
- struct buffer_head *bh = NULL;
-
- mlog_entry_void();
-
- BUG_ON(!ocfs2_inode_is_fast_symlink(inode));
- target = ocfs2_fast_symlink_getlink(inode, &bh);
- if (IS_ERR(target)) {
- status = PTR_ERR(target);
- mlog_errno(status);
- goto bail;
- }
-
- /* Fast symlinks can't be large */
- len = strlen(target);
- link = kzalloc(len + 1, GFP_NOFS);
- if (!link) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
-
- memcpy(link, target, len);
-
-bail:
- nd_set_link(nd, status ? ERR_PTR(status) : link);
+ /* will be less than a page size */
+ len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb));
+ kaddr = kmap_atomic(page);
+ memcpy(kaddr, link, len + 1);
+ kunmap_atomic(kaddr);
+ SetPageUptodate(page);
+ unlock_page(page);
brelse(bh);
-
- mlog_exit(status);
- return NULL;
+ return 0;
}
-static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
-{
- char *link = nd_get_link(nd);
- if (!IS_ERR(link))
- kfree(link);
-}
+const struct address_space_operations ocfs2_fast_symlink_aops = {
+ .readpage = ocfs2_fast_symlink_readpage,
+};
const struct inode_operations ocfs2_symlink_inode_operations = {
- .readlink = page_readlink,
+ .readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
.getattr = ocfs2_getattr,
@@ -165,15 +98,3 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
.removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
};
-const struct inode_operations ocfs2_fast_symlink_inode_operations = {
- .readlink = ocfs2_readlink,
- .follow_link = ocfs2_fast_follow_link,
- .put_link = ocfs2_fast_put_link,
- .getattr = ocfs2_getattr,
- .setattr = ocfs2_setattr,
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = ocfs2_listxattr,
- .removexattr = generic_removexattr,
- .fiemap = ocfs2_fiemap,
-};
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
index 65a6c9c6ad5..71ee4245e91 100644
--- a/fs/ocfs2/symlink.h
+++ b/fs/ocfs2/symlink.h
@@ -27,7 +27,7 @@
#define OCFS2_SYMLINK_H
extern const struct inode_operations ocfs2_symlink_inode_operations;
-extern const struct inode_operations ocfs2_fast_symlink_inode_operations;
+extern const struct address_space_operations ocfs2_fast_symlink_aops;
/*
* Test whether an inode is a fast symlink.
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 40e53702948..af155c18312 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,10 +25,8 @@
#include <linux/fs.h>
#include <linux/types.h>
-#include <linux/slab.h>
#include <linux/highmem.h>
-#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -45,11 +43,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
int type,
u32 slot);
-static inline int is_global_system_inode(int type);
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
- int type,
- u32 slot);
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
#endif
@@ -60,11 +53,51 @@ static inline int is_global_system_inode(int type)
type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
}
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
- int type,
- u32 slot)
+static struct inode **get_local_system_inode(struct ocfs2_super *osb,
+ int type,
+ u32 slot)
{
- return slot == osb->slot_num || is_global_system_inode(type);
+ int index;
+ struct inode **local_system_inodes, **free = NULL;
+
+ BUG_ON(slot == OCFS2_INVALID_SLOT);
+ BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
+ type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
+
+ spin_lock(&osb->osb_lock);
+ local_system_inodes = osb->local_system_inodes;
+ spin_unlock(&osb->osb_lock);
+
+ if (unlikely(!local_system_inodes)) {
+ local_system_inodes = kzalloc(sizeof(struct inode *) *
+ NUM_LOCAL_SYSTEM_INODES *
+ osb->max_slots,
+ GFP_NOFS);
+ if (!local_system_inodes) {
+ mlog_errno(-ENOMEM);
+ /*
+ * return NULL here so that ocfs2_get_sytem_file_inodes
+ * will try to create an inode and use it. We will try
+ * to initialize local_system_inodes next time.
+ */
+ return NULL;
+ }
+
+ spin_lock(&osb->osb_lock);
+ if (osb->local_system_inodes) {
+ /* Someone has initialized it for us. */
+ free = local_system_inodes;
+ local_system_inodes = osb->local_system_inodes;
+ } else
+ osb->local_system_inodes = local_system_inodes;
+ spin_unlock(&osb->osb_lock);
+ kfree(free);
+ }
+
+ index = (slot * NUM_LOCAL_SYSTEM_INODES) +
+ (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
+
+ return &local_system_inodes[index];
}
struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -75,12 +108,16 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
struct inode **arr = NULL;
/* avoid the lookup if cached in local system file array */
- if (is_in_system_inode_array(osb, type, slot))
- arr = &(osb->system_inodes[type]);
+ if (is_global_system_inode(type)) {
+ arr = &(osb->global_system_inodes[type]);
+ } else
+ arr = get_local_system_inode(osb, type, slot);
+ mutex_lock(&osb->system_file_mutex);
if (arr && ((inode = *arr) != NULL)) {
/* get a ref in addition to the array ref */
inode = igrab(inode);
+ mutex_unlock(&osb->system_file_mutex);
BUG_ON(!inode);
return inode;
@@ -94,6 +131,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
*arr = igrab(inode);
BUG_ON(!*arr);
}
+ mutex_unlock(&osb->system_file_mutex);
return inode;
}
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index a0a120e82b9..82e17b076ce 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -54,21 +54,20 @@
#include <linux/buffer_head.h>
#include <linux/rbtree.h>
-#define MLOG_MASK_PREFIX ML_UPTODATE
-
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "inode.h"
#include "uptodate.h"
+#include "ocfs2_trace.h"
struct ocfs2_meta_cache_item {
struct rb_node c_node;
sector_t c_block;
};
-static struct kmem_cache *ocfs2_uptodate_cachep = NULL;
+static struct kmem_cache *ocfs2_uptodate_cachep;
u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
{
@@ -152,8 +151,8 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
while ((node = rb_last(root)) != NULL) {
item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
- mlog(0, "Purge item %llu\n",
- (unsigned long long) item->c_block);
+ trace_ocfs2_purge_copied_metadata_tree(
+ (unsigned long long) item->c_block);
rb_erase(&item->c_node, root);
kmem_cache_free(ocfs2_uptodate_cachep, item);
@@ -180,9 +179,9 @@ void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
to_purge = ci->ci_num_cached;
- mlog(0, "Purge %u %s items from Owner %llu\n", to_purge,
- tree ? "array" : "tree",
- (unsigned long long)ocfs2_metadata_cache_owner(ci));
+ trace_ocfs2_metadata_cache_purge(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ to_purge, tree);
/* If we're a tree, save off the root so that we can safely
* initialize the cache. We do the work to free tree members
@@ -249,10 +248,10 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
ocfs2_metadata_cache_lock(ci);
- mlog(0, "Owner %llu, query block %llu (inline = %u)\n",
- (unsigned long long)ocfs2_metadata_cache_owner(ci),
- (unsigned long long) bh->b_blocknr,
- !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
+ trace_ocfs2_buffer_cached_begin(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long) bh->b_blocknr,
+ !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
index = ocfs2_search_cache_array(ci, bh->b_blocknr);
@@ -261,7 +260,7 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
ocfs2_metadata_cache_unlock(ci);
- mlog(0, "index = %d, item = %p\n", index, item);
+ trace_ocfs2_buffer_cached_end(index, item);
return (index != -1) || (item != NULL);
}
@@ -306,8 +305,9 @@ static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
{
BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
- mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
- ci->ci_num_cached);
+ trace_ocfs2_append_cache_array(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)block, ci->ci_num_cached);
ci->ci_cache.ci_array[ci->ci_num_cached] = block;
ci->ci_num_cached++;
@@ -324,8 +324,9 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
struct ocfs2_meta_cache_item *tmp;
- mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
- ci->ci_num_cached);
+ trace_ocfs2_insert_cache_tree(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)block, ci->ci_num_cached);
while(*p) {
parent = *p;
@@ -389,9 +390,9 @@ static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
tree[i] = NULL;
}
- mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n",
- (unsigned long long)ocfs2_metadata_cache_owner(ci),
- ci->ci_flags, ci->ci_num_cached);
+ trace_ocfs2_expand_cache(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ ci->ci_flags, ci->ci_num_cached);
}
/* Slow path function - memory allocation is necessary. See the
@@ -405,9 +406,9 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
{ NULL, };
- mlog(0, "Owner %llu, block %llu, expand = %d\n",
- (unsigned long long)ocfs2_metadata_cache_owner(ci),
- (unsigned long long)block, expand_tree);
+ trace_ocfs2_set_buffer_uptodate(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)block, expand_tree);
new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
if (!new) {
@@ -433,7 +434,6 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
ocfs2_metadata_cache_lock(ci);
if (ocfs2_insert_can_use_array(ci)) {
- mlog(0, "Someone cleared the tree underneath us\n");
/* Ok, items were removed from the cache in between
* locks. Detect this and revert back to the fast path */
ocfs2_append_cache_array(ci, block);
@@ -490,9 +490,9 @@ void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
if (ocfs2_buffer_cached(ci, bh))
return;
- mlog(0, "Owner %llu, inserting block %llu\n",
- (unsigned long long)ocfs2_metadata_cache_owner(ci),
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_set_buffer_uptodate_begin(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)bh->b_blocknr);
/* No need to recheck under spinlock - insertion is guarded by
* co_io_lock() */
@@ -542,8 +542,9 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
BUG_ON(index >= ci->ci_num_cached);
BUG_ON(!ci->ci_num_cached);
- mlog(0, "remove index %d (num_cached = %u\n", index,
- ci->ci_num_cached);
+ trace_ocfs2_remove_metadata_array(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ index, ci->ci_num_cached);
ci->ci_num_cached--;
@@ -559,8 +560,9 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
struct ocfs2_meta_cache_item *item)
{
- mlog(0, "remove block %llu from tree\n",
- (unsigned long long) item->c_block);
+ trace_ocfs2_remove_metadata_tree(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)item->c_block);
rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
ci->ci_num_cached--;
@@ -573,10 +575,10 @@ static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
struct ocfs2_meta_cache_item *item = NULL;
ocfs2_metadata_cache_lock(ci);
- mlog(0, "Owner %llu, remove %llu, items = %u, array = %u\n",
- (unsigned long long)ocfs2_metadata_cache_owner(ci),
- (unsigned long long) block, ci->ci_num_cached,
- ci->ci_flags & OCFS2_CACHE_FL_INLINE);
+ trace_ocfs2_remove_block_from_cache(
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long) block, ci->ci_num_cached,
+ ci->ci_flags);
if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
index = ocfs2_search_cache_array(ci, block);
@@ -626,9 +628,6 @@ int __init init_ocfs2_uptodate_cache(void)
if (!ocfs2_uptodate_cachep)
return -ENOMEM;
- mlog(0, "%u inlined cache items per inode.\n",
- OCFS2_CACHE_INFO_MAX_ARRAY);
-
return 0;
}
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
deleted file mode 100644
index e2488f4128a..00000000000
--- a/fs/ocfs2/ver.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define OCFS2_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
-
-void ocfs2_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
deleted file mode 100644
index d7395cb91d2..00000000000
--- a/fs/ocfs2/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_VER_H
-#define OCFS2_VER_H
-
-void ocfs2_print_version(void);
-
-#endif /* OCFS2_VER_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d1b0d386f6d..016f01df382 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -37,7 +37,6 @@
#include <linux/string.h>
#include <linux/security.h>
-#define MLOG_MASK_PREFIX ML_XATTR
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -57,6 +56,7 @@
#include "xattr.h"
#include "refcounttree.h"
#include "acl.h"
+#include "ocfs2_trace.h"
struct ocfs2_xattr_def_value_root {
struct ocfs2_xattr_value_root xv;
@@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt {
struct ocfs2_alloc_context *meta_ac;
struct ocfs2_alloc_context *data_ac;
struct ocfs2_cached_dealloc_ctxt dealloc;
+ int set_abort;
};
#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
@@ -96,21 +97,21 @@ static struct ocfs2_xattr_def_value_root def_xv = {
.xv.xr_list.l_count = cpu_to_le16(1),
};
-struct xattr_handler *ocfs2_xattr_handlers[] = {
+const struct xattr_handler *ocfs2_xattr_handlers[] = {
&ocfs2_xattr_user_handler,
- &ocfs2_xattr_acl_access_handler,
- &ocfs2_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
&ocfs2_xattr_trusted_handler,
&ocfs2_xattr_security_handler,
NULL
};
-static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
+static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
[OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
- = &ocfs2_xattr_acl_access_handler,
+ = &posix_acl_access_xattr_handler,
[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
- = &ocfs2_xattr_acl_default_handler,
+ = &posix_acl_default_xattr_handler,
[OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
[OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
};
@@ -368,7 +369,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
* them fully.
*/
static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
- u64 xb_blkno)
+ u64 xb_blkno, int new)
{
int i, rc = 0;
@@ -376,15 +377,22 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
xb_blkno + i);
if (!bucket->bu_bhs[i]) {
- rc = -EIO;
+ rc = -ENOMEM;
mlog_errno(rc);
break;
}
if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
- bucket->bu_bhs[i]))
- ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
- bucket->bu_bhs[i]);
+ bucket->bu_bhs[i])) {
+ if (new)
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+ bucket->bu_bhs[i]);
+ else {
+ set_buffer_uptodate(bucket->bu_bhs[i]);
+ ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+ bucket->bu_bhs[i]);
+ }
+ }
}
if (rc)
@@ -473,8 +481,7 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
struct ocfs2_xattr_block *xb =
(struct ocfs2_xattr_block *)bh->b_data;
- mlog(0, "Validating xattr block %llu\n",
- (unsigned long long)bh->b_blocknr);
+ trace_ocfs2_validate_xattr_block((unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
@@ -539,7 +546,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
static inline const char *ocfs2_xattr_prefix(int name_index)
{
- struct xattr_handler *handler = NULL;
+ const struct xattr_handler *handler = NULL;
if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
handler = ocfs2_xattr_handler_map[name_index];
@@ -623,7 +630,7 @@ int ocfs2_calc_security_init(struct inode *dir,
int ocfs2_calc_xattr_init(struct inode *dir,
struct buffer_head *dir_bh,
- int mode,
+ umode_t mode,
struct ocfs2_security_xattr_info *si,
int *want_clusters,
int *xattr_credits,
@@ -708,52 +715,61 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
struct ocfs2_xattr_value_buf *vb,
struct ocfs2_xattr_set_ctxt *ctxt)
{
- int status = 0;
+ int status = 0, credits;
handle_t *handle = ctxt->handle;
enum ocfs2_alloc_restarted why;
u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
struct ocfs2_extent_tree et;
- mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
-
ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
- status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ while (clusters_to_add) {
+ trace_ocfs2_xattr_extend_allocation(clusters_to_add);
- prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
- status = ocfs2_add_clusters_in_btree(handle,
- &et,
- &logical_start,
- clusters_to_add,
- 0,
- ctxt->data_ac,
- ctxt->meta_ac,
- &why);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ break;
+ }
- status = ocfs2_journal_dirty(handle, vb->vb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
+ prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+ status = ocfs2_add_clusters_in_btree(handle,
+ &et,
+ &logical_start,
+ clusters_to_add,
+ 0,
+ ctxt->data_ac,
+ ctxt->meta_ac,
+ &why);
+ if ((status < 0) && (status != -EAGAIN)) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ break;
+ }
- clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
+ ocfs2_journal_dirty(handle, vb->vb_bh);
- /*
- * We should have already allocated enough space before the transaction,
- * so no need to restart.
- */
- BUG_ON(why != RESTART_NONE || clusters_to_add);
+ clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) -
+ prev_clusters;
-leave:
+ if (why != RESTART_NONE && clusters_to_add) {
+ /*
+ * We can only fail in case the alloc file doesn't give
+ * up enough clusters.
+ */
+ BUG_ON(why == RESTART_META);
+
+ credits = ocfs2_calc_extend_credits(inode->i_sb,
+ &vb->vb_xv->xr_list);
+ status = ocfs2_extend_trans(handle, credits);
+ if (status < 0) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ break;
+ }
+ }
+ }
return status;
}
@@ -786,12 +802,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
}
le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
-
- ret = ocfs2_journal_dirty(handle, vb->vb_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ ocfs2_journal_dirty(handle, vb->vb_bh);
if (ext_flags & OCFS2_EXT_REFCOUNTED)
ret = ocfs2_decrease_refcount(inode, handle,
@@ -1278,13 +1289,11 @@ int ocfs2_xattr_get_nolock(struct inode *inode,
xis.inode_bh = xbs.inode_bh = di_bh;
di = (struct ocfs2_dinode *)di_bh->b_data;
- down_read(&oi->ip_xattr_sem);
ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
buffer_size, &xis);
if (ret == -ENODATA && di->i_xattr_loc)
ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
buffer_size, &xbs);
- up_read(&oi->ip_xattr_sem);
return ret;
}
@@ -1308,8 +1317,10 @@ static int ocfs2_xattr_get(struct inode *inode,
mlog_errno(ret);
return ret;
}
+ down_read(&OCFS2_I(inode)->ip_xattr_sem);
ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
name, buffer, buffer_size);
+ up_read(&OCFS2_I(inode)->ip_xattr_sem);
ocfs2_inode_unlock(inode, 0);
@@ -1374,11 +1385,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
memset(bh->b_data + cp_len, 0,
blocksize - cp_len);
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
+ ocfs2_journal_dirty(handle, bh);
brelse(bh);
bh = NULL;
@@ -1622,7 +1629,7 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
/* Now tell xh->xh_entries about it */
for (i = 0; i < count; i++) {
offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
- if (offset < namevalue_offset)
+ if (offset <= namevalue_offset)
le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
namevalue_size);
}
@@ -2148,15 +2155,19 @@ alloc_value:
orig_clusters = ocfs2_xa_value_clusters(loc);
rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
if (rc < 0) {
- /*
- * If we tried to grow an existing external value,
- * ocfs2_xa_cleanuP-value_truncate() is going to
- * let it stand. We have to restore its original
- * value size.
- */
- loc->xl_entry->xe_value_size = orig_value_size;
+ ctxt->set_abort = 1;
ocfs2_xa_cleanup_value_truncate(loc, "growing",
orig_clusters);
+ /*
+ * If we were growing an existing value,
+ * ocfs2_xa_cleanup_value_truncate() won't remove
+ * the entry. We need to restore the original value
+ * size.
+ */
+ if (loc->xl_entry) {
+ BUG_ON(!orig_value_size);
+ loc->xl_entry->xe_value_size = orig_value_size;
+ }
mlog_errno(rc);
}
}
@@ -2371,16 +2382,18 @@ static int ocfs2_remove_value_outside(struct inode*inode,
}
ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
- if (ret < 0) {
- mlog_errno(ret);
- break;
- }
ocfs2_commit_trans(osb, ctxt.handle);
if (ctxt.meta_ac) {
ocfs2_free_alloc_context(ctxt.meta_ac);
ctxt.meta_ac = NULL;
}
+
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
}
if (ctxt.meta_ac)
@@ -2479,7 +2492,10 @@ static int ocfs2_xattr_free_block(struct inode *inode,
xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
blk = le64_to_cpu(xb->xb_blkno);
bit = le16_to_cpu(xb->xb_suballoc_bit);
- bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+ if (xb->xb_suballoc_loc)
+ bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
+ else
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
xb_alloc_inode = ocfs2_get_system_file_inode(osb,
EXTENT_ALLOC_SYSTEM_INODE,
@@ -2593,10 +2609,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
spin_unlock(&oi->ip_lock);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, di_bh);
out_commit:
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
@@ -2724,9 +2739,7 @@ static int ocfs2_xattr_ibody_init(struct inode *inode,
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
spin_unlock(&oi->ip_lock);
- ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(ctxt->handle, di_bh);
out:
return ret;
@@ -2745,7 +2758,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
{
int ret;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
struct ocfs2_xa_loc loc;
if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
@@ -2753,13 +2765,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
down_write(&oi->ip_alloc_sem);
if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
- if (!ocfs2_xattr_has_space_inline(inode, di)) {
- ret = -ENOSPC;
- goto out;
- }
- }
-
- if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
if (ret) {
if (ret != -ENOSPC)
@@ -2846,9 +2851,8 @@ static int ocfs2_create_xattr_block(struct inode *inode,
int ret;
u16 suballoc_bit_start;
u32 num_got;
- u64 first_blkno;
+ u64 suballoc_loc, first_blkno;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *new_bh = NULL;
struct ocfs2_xattr_block *xblk;
@@ -2859,15 +2863,21 @@ static int ocfs2_create_xattr_block(struct inode *inode,
goto end;
}
- ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
- &suballoc_bit_start, &num_got,
- &first_blkno);
+ ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
+ &suballoc_loc, &suballoc_bit_start,
+ &num_got, &first_blkno);
if (ret < 0) {
mlog_errno(ret);
goto end;
}
new_bh = sb_getblk(inode->i_sb, first_blkno);
+ if (!new_bh) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto end;
+ }
+
ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
@@ -2883,8 +2893,10 @@ static int ocfs2_create_xattr_block(struct inode *inode,
memset(xblk, 0, inode->i_sb->s_blocksize);
strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
+ xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
- xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+ xblk->xb_fs_generation =
+ cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
xblk->xb_blkno = cpu_to_le64(first_blkno);
if (indexed) {
struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
@@ -2956,7 +2968,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
ret = ocfs2_xa_set(&loc, xi, ctxt);
if (!ret)
xs->here = loc.xl_entry;
- else if (ret != -ENOSPC)
+ else if ((ret != -ENOSPC) || ctxt->set_abort)
goto end;
else {
ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
@@ -3041,8 +3053,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
clusters_add += new_clusters;
credits += ocfs2_calc_extend_credits(inode->i_sb,
- &def_xv.xv.xr_list,
- new_clusters);
+ &def_xv.xv.xr_list);
}
goto meta_guess;
@@ -3107,8 +3118,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
if (!ocfs2_xattr_is_local(xe))
credits += ocfs2_calc_extend_credits(
inode->i_sb,
- &def_xv.xv.xr_list,
- new_clusters);
+ &def_xv.xv.xr_list);
goto out;
}
}
@@ -3133,9 +3143,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
clusters_add += new_clusters - old_clusters;
credits += ocfs2_calc_extend_credits(inode->i_sb,
- &xv->xr_list,
- new_clusters -
- old_clusters);
+ &xv->xr_list);
if (value_size >= OCFS2_XATTR_ROOT_SIZE)
goto out;
}
@@ -3181,7 +3189,7 @@ meta_guess:
&xb->xb_attrs.xb_root.xt_list;
meta_add += ocfs2_extend_meta_needed(el);
credits += ocfs2_calc_extend_credits(inode->i_sb,
- el, 1);
+ el);
} else
credits += OCFS2_SUBALLOC_ALLOC + 1;
@@ -3200,8 +3208,15 @@ meta_guess:
clusters_add += 1;
}
} else {
- meta_add += 1;
credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+ struct ocfs2_extent_list *el = &def_xv.xv.xr_list;
+ meta_add += ocfs2_extend_meta_needed(el);
+ credits += ocfs2_calc_extend_credits(inode->i_sb,
+ el);
+ } else {
+ meta_add += 1;
+ }
}
out:
if (clusters_need)
@@ -3238,8 +3253,8 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
}
meta_add += extra_meta;
- mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
- "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits);
+ trace_ocfs2_init_xattr_set_ctxt(xi->xi_name, meta_add,
+ clusters_add, *credits);
if (meta_add) {
ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -3312,14 +3327,13 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
goto out;
}
- ret = ocfs2_extend_trans(ctxt->handle, credits +
- ctxt->handle->h_buffer_credits);
+ ret = ocfs2_extend_trans(ctxt->handle, credits);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
- } else if (ret == -ENOSPC) {
+ } else if ((ret == -ENOSPC) && !ctxt->set_abort) {
if (di->i_xattr_loc && !xbs->xattr_bh) {
ret = ocfs2_xattr_block_find(inode,
xi->xi_name_index,
@@ -3343,8 +3357,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
goto out;
}
- ret = ocfs2_extend_trans(ctxt->handle, credits +
- ctxt->handle->h_buffer_credits);
+ ret = ocfs2_extend_trans(ctxt->handle, credits);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3378,8 +3391,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
goto out;
}
- ret = ocfs2_extend_trans(ctxt->handle, credits +
- ctxt->handle->h_buffer_credits);
+ ret = ocfs2_extend_trans(ctxt->handle, credits);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3509,7 +3521,7 @@ int ocfs2_xattr_set(struct inode *inode,
int ret, credits, ref_meta = 0, ref_credits = 0;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct inode *tl_inode = osb->osb_tl_inode;
- struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+ struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, };
struct ocfs2_refcount_tree *ref_tree = NULL;
struct ocfs2_xattr_info xi = {
@@ -3552,7 +3564,7 @@ int ocfs2_xattr_set(struct inode *inode,
down_write(&OCFS2_I(inode)->ip_xattr_sem);
/*
* Scan inode and external block to find the same name
- * extended attribute and collect search infomation.
+ * extended attribute and collect search information.
*/
ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
if (ret)
@@ -3576,7 +3588,7 @@ int ocfs2_xattr_set(struct inode *inode,
goto cleanup;
}
- /* Check whether the value is refcounted and do some prepartion. */
+ /* Check whether the value is refcounted and do some preparation. */
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
(!xis.not_found || !xbs.not_found)) {
ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
@@ -3613,13 +3625,15 @@ int ocfs2_xattr_set(struct inode *inode,
if (IS_ERR(ctxt.handle)) {
ret = PTR_ERR(ctxt.handle);
mlog_errno(ret);
- goto cleanup;
+ goto out_free_ac;
}
ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+ ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0);
ocfs2_commit_trans(osb, ctxt.handle);
+out_free_ac:
if (ctxt.data_ac)
ocfs2_free_alloc_context(ctxt.data_ac);
if (ctxt.meta_ac)
@@ -3882,8 +3896,10 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
if (found) {
xs->here = &xs->header->xh_entries[index];
- mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
- (unsigned long long)bucket_blkno(xs->bucket), index);
+ trace_ocfs2_xattr_bucket_find(OCFS2_I(inode)->ip_blkno,
+ name, name_index, name_hash,
+ (unsigned long long)bucket_blkno(xs->bucket),
+ index);
} else
ret = -ENODATA;
@@ -3910,8 +3926,10 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
if (le16_to_cpu(el->l_next_free_rec) == 0)
return -ENODATA;
- mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n",
- name, name_hash, name_index);
+ trace_ocfs2_xattr_index_block_find(OCFS2_I(inode)->ip_blkno,
+ name, name_index, name_hash,
+ (unsigned long long)root_bh->b_blocknr,
+ -1);
ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
&num_clusters, el);
@@ -3922,9 +3940,10 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
- mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
- "in the rec is %u\n", num_clusters, (unsigned long long)p_blkno,
- first_hash);
+ trace_ocfs2_xattr_index_block_find_rec(OCFS2_I(inode)->ip_blkno,
+ name, name_index, first_hash,
+ (unsigned long long)p_blkno,
+ num_clusters);
ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
p_blkno, first_hash, num_clusters, xs);
@@ -3950,8 +3969,9 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
return -ENOMEM;
}
- mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
- clusters, (unsigned long long)blkno);
+ trace_ocfs2_iterate_xattr_buckets(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)blkno, clusters);
for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
ret = ocfs2_read_xattr_bucket(bucket, blkno);
@@ -3967,8 +3987,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
if (i == 0)
num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
- mlog(0, "iterating xattr bucket %llu, first hash %u\n",
- (unsigned long long)blkno,
+ trace_ocfs2_iterate_xattr_bucket((unsigned long long)blkno,
le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
if (func) {
ret = func(inode, bucket, para);
@@ -4168,9 +4187,9 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
char *src = xb_bh->b_data;
char *target = bucket_block(bucket, blks - 1);
- mlog(0, "cp xattr from block %llu to bucket %llu\n",
- (unsigned long long)xb_bh->b_blocknr,
- (unsigned long long)bucket_blkno(bucket));
+ trace_ocfs2_cp_xattr_block_to_bucket_begin(
+ (unsigned long long)xb_bh->b_blocknr,
+ (unsigned long long)bucket_blkno(bucket));
for (i = 0; i < blks; i++)
memset(bucket_block(bucket, i), 0, blocksize);
@@ -4206,8 +4225,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
for (i = 0; i < count; i++)
le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
- mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n",
- offset, size, off_change);
+ trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change);
sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
cmp_xe, swap_xe);
@@ -4249,7 +4267,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
u32 bit_off, len;
u64 blkno;
handle_t *handle = ctxt->handle;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct buffer_head *xb_bh = xs->xattr_bh;
struct ocfs2_xattr_block *xb =
@@ -4257,8 +4274,8 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
struct ocfs2_xattr_tree_root *xr;
u16 xb_flags = le16_to_cpu(xb->xb_flags);
- mlog(0, "create xattr index block for %llu\n",
- (unsigned long long)xb_bh->b_blocknr);
+ trace_ocfs2_xattr_create_index_block_begin(
+ (unsigned long long)xb_bh->b_blocknr);
BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
BUG_ON(!xs->bucket);
@@ -4277,7 +4294,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
goto out;
}
- ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+ ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
1, 1, &bit_off, &len);
if (ret) {
mlog_errno(ret);
@@ -4291,10 +4308,9 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
*/
blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
- mlog(0, "allocate 1 cluster from %llu to xattr block\n",
- (unsigned long long)blkno);
+ trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
- ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
+ ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4396,8 +4412,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
entries = (char *)xh->xh_entries;
xh_free_start = le16_to_cpu(xh->xh_free_start);
- mlog(0, "adjust xattr bucket in %llu, count = %u, "
- "xh_free_start = %u, xh_name_value_len = %u.\n",
+ trace_ocfs2_defrag_xattr_bucket(
(unsigned long long)blkno, le16_to_cpu(xh->xh_count),
xh_free_start, le16_to_cpu(xh->xh_name_value_len));
@@ -4499,8 +4514,9 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
- mlog(0, "move half of xattrs in cluster %llu to %llu\n",
- (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
+ trace_ocfs2_mv_xattr_bucket_cross_cluster(
+ (unsigned long long)last_cluster_blkno,
+ (unsigned long long)new_blkno);
ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
last_cluster_blkno, new_blkno,
@@ -4610,8 +4626,8 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
struct ocfs2_xattr_entry *xe;
int blocksize = inode->i_sb->s_blocksize;
- mlog(0, "move some of xattrs from bucket %llu to %llu\n",
- (unsigned long long)blk, (unsigned long long)new_blk);
+ trace_ocfs2_divide_xattr_bucket_begin((unsigned long long)blk,
+ (unsigned long long)new_blk);
s_bucket = ocfs2_xattr_bucket_new(inode);
t_bucket = ocfs2_xattr_bucket_new(inode);
@@ -4638,7 +4654,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
* Even if !new_bucket_head, we're overwriting t_bucket. Thus,
* there's no need to read it.
*/
- ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
+ ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4710,9 +4726,9 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
*/
xe = &xh->xh_entries[start];
len = sizeof(struct ocfs2_xattr_entry) * (count - start);
- mlog(0, "mv xattr entry len %d from %d to %d\n", len,
- (int)((char *)xe - (char *)xh),
- (int)((char *)xh->xh_entries - (char *)xh));
+ trace_ocfs2_divide_xattr_bucket_move(len,
+ (int)((char *)xe - (char *)xh),
+ (int)((char *)xh->xh_entries - (char *)xh));
memmove((char *)xh->xh_entries, (char *)xe, len);
xe = &xh->xh_entries[count - start];
len = sizeof(struct ocfs2_xattr_entry) * start;
@@ -4784,9 +4800,9 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
BUG_ON(s_blkno == t_blkno);
- mlog(0, "cp bucket %llu to %llu, target is %d\n",
- (unsigned long long)s_blkno, (unsigned long long)t_blkno,
- t_is_new);
+ trace_ocfs2_cp_xattr_bucket((unsigned long long)s_blkno,
+ (unsigned long long)t_blkno,
+ t_is_new);
s_bucket = ocfs2_xattr_bucket_new(inode);
t_bucket = ocfs2_xattr_bucket_new(inode);
@@ -4804,7 +4820,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
* Even if !t_is_new, we're overwriting t_bucket. Thus,
* there's no need to read it.
*/
- ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
+ ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new);
if (ret)
goto out;
@@ -4858,8 +4874,8 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
struct ocfs2_xattr_bucket *old_first, *new_first;
- mlog(0, "mv xattrs from cluster %llu to %llu\n",
- (unsigned long long)last_blk, (unsigned long long)to_blk);
+ trace_ocfs2_mv_xattr_buckets((unsigned long long)last_blk,
+ (unsigned long long)to_blk);
BUG_ON(start_bucket >= num_buckets);
if (start_bucket) {
@@ -4887,8 +4903,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
* We need to update the first bucket of the old extent and all
* the buckets going to the new extent.
*/
- credits = ((num_buckets + 1) * blks_per_bucket) +
- handle->h_buffer_credits;
+ credits = ((num_buckets + 1) * blks_per_bucket);
ret = ocfs2_extend_trans(handle, credits);
if (ret) {
mlog_errno(ret);
@@ -4958,7 +4973,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
u32 *first_hash)
{
u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
+ int ret, credits = 2 * blk_per_bucket;
BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
@@ -5010,9 +5025,9 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
{
int ret;
- mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
- (unsigned long long)bucket_blkno(first), prev_clusters,
- (unsigned long long)new_blk);
+ trace_ocfs2_adjust_xattr_cross_cluster(
+ (unsigned long long)bucket_blkno(first),
+ (unsigned long long)new_blk, prev_clusters);
if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
@@ -5085,10 +5100,10 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_extent_tree et;
- mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
- "previous xattr blkno = %llu\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- prev_cpos, (unsigned long long)bucket_blkno(first));
+ trace_ocfs2_add_new_xattr_cluster_begin(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)bucket_blkno(first),
+ prev_cpos, prev_clusters);
ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
@@ -5099,7 +5114,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
goto leave;
}
- ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
+ ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
clusters_to_add, &bit_off, &num_bits);
if (ret < 0) {
if (ret != -ENOSPC)
@@ -5110,8 +5125,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
BUG_ON(num_bits > clusters_to_add);
block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
- mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
- num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ trace_ocfs2_add_new_xattr_cluster((unsigned long long)block, num_bits);
if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
(prev_clusters + num_bits) << osb->s_clustersize_bits <=
@@ -5127,8 +5141,6 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
*/
v_start = prev_cpos + prev_clusters;
*num_clusters = prev_clusters + num_bits;
- mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
- num_bits);
} else {
ret = ocfs2_adjust_xattr_cross_cluster(inode,
handle,
@@ -5144,8 +5156,8 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
}
}
- mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
- num_bits, (unsigned long long)block, v_start);
+ trace_ocfs2_add_new_xattr_cluster_insert((unsigned long long)block,
+ v_start, num_bits);
ret = ocfs2_insert_extent(handle, &et, v_start, block,
num_bits, 0, ctxt->meta_ac);
if (ret < 0) {
@@ -5153,9 +5165,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
goto leave;
}
- ret = ocfs2_journal_dirty(handle, root_bh);
- if (ret < 0)
- mlog_errno(ret);
+ ocfs2_journal_dirty(handle, root_bh);
leave:
return ret;
@@ -5182,9 +5192,9 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
u64 end_blk;
u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
- mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
- "from %llu, len = %u\n", (unsigned long long)target_blk,
- (unsigned long long)bucket_blkno(first), num_clusters);
+ trace_ocfs2_extend_xattr_bucket((unsigned long long)target_blk,
+ (unsigned long long)bucket_blkno(first),
+ num_clusters, new_bucket);
/* The extent must have room for an additional bucket */
BUG_ON(new_bucket >=
@@ -5200,8 +5210,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
* existing bucket. Then we add the last existing bucket, the
* new bucket, and the first bucket (3 * blk_per_bucket).
*/
- credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
- handle->h_buffer_credits;
+ credits = (end_blk - target_blk) + (3 * blk_per_bucket);
ret = ocfs2_extend_trans(handle, credits);
if (ret) {
mlog_errno(ret);
@@ -5265,8 +5274,8 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
/* The bucket at the front of the extent */
struct ocfs2_xattr_bucket *first;
- mlog(0, "Add new xattr bucket starting from %llu\n",
- (unsigned long long)bucket_blkno(target));
+ trace_ocfs2_add_new_xattr_bucket(
+ (unsigned long long)bucket_blkno(target));
/* The first bucket of the original extent */
first = ocfs2_xattr_bucket_new(inode);
@@ -5382,8 +5391,8 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
* modified something. We have to assume they did, and dirty
* the whole bucket. This leaves us in a consistent state.
*/
- mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
- xe_off, (unsigned long long)bucket_blkno(bucket), len);
+ trace_ocfs2_xattr_bucket_value_truncate(
+ (unsigned long long)bucket_blkno(bucket), xe_off, len);
ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
if (ret) {
mlog_errno(ret);
@@ -5433,8 +5442,9 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
ocfs2_init_dealloc_ctxt(&dealloc);
- mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
- cpos, len, (unsigned long long)blkno);
+ trace_ocfs2_rm_xattr_cluster(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)blkno, cpos, len);
ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno,
len);
@@ -5477,16 +5487,12 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
}
le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
-
- ret = ocfs2_journal_dirty(handle, root_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_journal_dirty(handle, root_bh);
ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
if (ret)
mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
out_commit:
ocfs2_commit_trans(osb, handle);
@@ -5543,7 +5549,7 @@ static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
int ret;
struct ocfs2_xa_loc loc;
- mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name);
+ trace_ocfs2_xattr_set_entry_bucket(xi->xi_name);
ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
xs->not_found ? NULL : xs->here);
@@ -5575,7 +5581,6 @@ static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
out:
- mlog_exit(ret);
return ret;
}
@@ -5586,7 +5591,7 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
{
int ret;
- mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
+ trace_ocfs2_xattr_set_entry_index_block(xi->xi_name);
ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
if (!ret)
@@ -5642,7 +5647,6 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
mlog_errno(ret);
out:
- mlog_exit(ret);
return ret;
}
@@ -5896,6 +5900,10 @@ static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
while (cpos < clusters) {
ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
&num_clusters, el, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
cpos += num_clusters;
if ((ext_flags & OCFS2_EXT_REFCOUNTED))
@@ -6046,9 +6054,9 @@ static int ocfs2_xattr_bucket_value_refcount(struct inode *inode,
if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
p = &refcount;
- mlog(0, "refcount bucket %llu, count = %u\n",
- (unsigned long long)bucket_blkno(bucket),
- le16_to_cpu(xh->xh_count));
+ trace_ocfs2_xattr_bucket_value_refcount(
+ (unsigned long long)bucket_blkno(bucket),
+ le16_to_cpu(xh->xh_count));
for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
xe = &xh->xh_entries[i];
@@ -6226,8 +6234,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
le16_to_cpu(xv->xr_list.l_next_free_rec);
*credits += ocfs2_calc_extend_credits(sb,
- &def_xv.xv.xr_list,
- le32_to_cpu(xv->xr_clusters));
+ &def_xv.xv.xr_list);
/*
* If the value is a tree with depth > 1, We don't go deep
@@ -6344,8 +6351,8 @@ static int ocfs2_reflink_xattr_header(handle_t *handle,
u32 clusters, cpos, p_cluster, num_clusters;
unsigned int ext_flags = 0;
- mlog(0, "reflink xattr in container %llu, count = %u\n",
- (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count));
+ trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr,
+ le16_to_cpu(xh->xh_count));
last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
@@ -6506,6 +6513,16 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
}
new_oi = OCFS2_I(args->new_inode);
+ /*
+ * Adjust extent record count to reserve space for extended attribute.
+ * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
+ */
+ if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
+ !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
+ struct ocfs2_extent_list *el = &new_di->id2.i_list;
+ le16_add_cpu(&el->l_count, -(inline_size /
+ sizeof(struct ocfs2_extent_rec)));
+ }
spin_lock(&new_oi->ip_lock);
new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
@@ -6528,13 +6545,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
int indexed)
{
int ret;
- struct ocfs2_alloc_context *meta_ac;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_xattr_set_ctxt ctxt = {
- .meta_ac = meta_ac,
- };
+ struct ocfs2_xattr_set_ctxt ctxt;
- ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+ memset(&ctxt, 0, sizeof(ctxt));
+ ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -6547,8 +6562,8 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
goto out;
}
- mlog(0, "create new xattr block for inode %llu, index = %d\n",
- (unsigned long long)fe_bh->b_blocknr, indexed);
+ trace_ocfs2_create_empty_xattr_block(
+ (unsigned long long)fe_bh->b_blocknr, indexed);
ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
ret_bh);
if (ret)
@@ -6556,7 +6571,7 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
ocfs2_commit_trans(osb, ctxt.handle);
out:
- ocfs2_free_alloc_context(meta_ac);
+ ocfs2_free_alloc_context(ctxt.meta_ac);
return ret;
}
@@ -6784,7 +6799,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(
metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
*credits += ocfs2_calc_extend_credits(osb->sb,
- xt_et->et_root_el, len);
+ xt_et->et_root_el);
if (metas.num_metas) {
ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
@@ -6804,23 +6819,22 @@ out:
if (ret) {
if (*meta_ac) {
ocfs2_free_alloc_context(*meta_ac);
- meta_ac = NULL;
+ *meta_ac = NULL;
}
}
return ret;
}
-static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+static int ocfs2_reflink_xattr_bucket(handle_t *handle,
u64 blkno, u64 new_blkno, u32 clusters,
+ u32 *cpos, int num_buckets,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_reflink_xattr_tree_args *args)
{
int i, j, ret = 0;
struct super_block *sb = args->reflink->old_inode->i_sb;
- u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
- u32 num_buckets = clusters * bpc;
int bpb = args->old_bucket->bu_blocks;
struct ocfs2_xattr_value_buf vb = {
.vb_access = ocfs2_journal_access,
@@ -6833,20 +6847,12 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
break;
}
- ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno);
+ ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1);
if (ret) {
mlog_errno(ret);
break;
}
- /*
- * The real bucket num in this series of blocks is stored
- * in the 1st bucket.
- */
- if (i == 0)
- num_buckets = le16_to_cpu(
- bucket_xh(args->old_bucket)->xh_num_buckets);
-
ret = ocfs2_xattr_bucket_journal_access(handle,
args->new_bucket,
OCFS2_JOURNAL_ACCESS_CREATE);
@@ -6860,6 +6866,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
bucket_block(args->old_bucket, j),
sb->s_blocksize);
+ /*
+ * Record the start cpos so that we can use it to initialize
+ * our xattr tree we also set the xh_num_bucket for the new
+ * bucket.
+ */
+ if (i == 0) {
+ *cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
+ xh_entries[0].xe_name_hash);
+ bucket_xh(args->new_bucket)->xh_num_buckets =
+ cpu_to_le16(num_buckets);
+ }
+
ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
ret = ocfs2_reflink_xattr_header(handle, args->reflink,
@@ -6889,6 +6907,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
}
ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
ocfs2_xattr_bucket_relse(args->old_bucket);
ocfs2_xattr_bucket_relse(args->new_bucket);
}
@@ -6897,6 +6916,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
ocfs2_xattr_bucket_relse(args->new_bucket);
return ret;
}
+
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+ struct inode *inode,
+ struct ocfs2_reflink_xattr_tree_args *args,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac,
+ u64 blkno, u32 cpos, u32 len)
+{
+ int ret, first_inserted = 0;
+ u32 p_cluster, num_clusters, reflink_cpos = 0;
+ u64 new_blkno;
+ unsigned int num_buckets, reflink_buckets;
+ unsigned int bpc =
+ ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+
+ ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
+ ocfs2_xattr_bucket_relse(args->old_bucket);
+
+ while (len && num_buckets) {
+ ret = ocfs2_claim_clusters(handle, data_ac,
+ 1, &p_cluster, &num_clusters);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+ reflink_buckets = min(num_buckets, bpc * num_clusters);
+
+ ret = ocfs2_reflink_xattr_bucket(handle, blkno,
+ new_blkno, num_clusters,
+ &reflink_cpos, reflink_buckets,
+ meta_ac, data_ac, args);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * For the 1st allocated cluster, we make it use the same cpos
+ * so that the xattr tree looks the same as the original one
+ * in the most case.
+ */
+ if (!first_inserted) {
+ reflink_cpos = cpos;
+ first_inserted = 1;
+ }
+ ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
+ num_clusters, 0, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+
+ trace_ocfs2_reflink_xattr_buckets((unsigned long long)new_blkno,
+ num_clusters, reflink_cpos);
+
+ len -= num_clusters;
+ blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+ num_buckets -= reflink_buckets;
+ }
+out:
+ return ret;
+}
+
/*
* Create the same xattr extent record in the new inode's xattr tree.
*/
@@ -6908,8 +6996,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
void *para)
{
int ret, credits = 0;
- u32 p_cluster, num_clusters;
- u64 new_blkno;
handle_t *handle;
struct ocfs2_reflink_xattr_tree_args *args =
(struct ocfs2_reflink_xattr_tree_args *)para;
@@ -6918,6 +7004,8 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_extent_tree et;
+ trace_ocfs2_reflink_xattr_rec((unsigned long long)blkno, len);
+
ocfs2_init_xattr_tree_extent_tree(&et,
INODE_CACHE(args->reflink->new_inode),
args->new_blk_bh);
@@ -6937,32 +7025,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
goto out;
}
- ret = ocfs2_claim_clusters(osb, handle, data_ac,
- len, &p_cluster, &num_clusters);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
-
- mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
- (unsigned long long)blkno, (unsigned long long)new_blkno, len);
- ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
- meta_ac, data_ac, args);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
- (unsigned long long)new_blkno, len, cpos);
- ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
- len, 0, meta_ac);
+ ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
+ meta_ac, data_ac,
+ blkno, cpos, len);
if (ret)
mlog_errno(ret);
-out_commit:
ocfs2_commit_trans(osb, handle);
out:
@@ -7034,7 +7102,7 @@ static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
goto out;
}
- if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED))
+ if (!indexed)
ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
else
ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
@@ -7138,24 +7206,16 @@ out:
* must not hold any lock expect i_mutex.
*/
int ocfs2_init_security_and_acl(struct inode *dir,
- struct inode *inode)
+ struct inode *inode,
+ const struct qstr *qstr,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl)
{
- int ret = 0;
struct buffer_head *dir_bh = NULL;
- struct ocfs2_security_xattr_info si = {
- .enable = 1,
- };
+ int ret = 0;
- ret = ocfs2_init_security_get(inode, dir, &si);
- if (!ret) {
- ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
- si.name, si.value, si.value_len,
- XATTR_CREATE);
- if (ret) {
- mlog_errno(ret);
- goto leave;
- }
- } else if (ret != -EOPNOTSUPP) {
+ ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
+ if (ret) {
mlog_errno(ret);
goto leave;
}
@@ -7166,9 +7226,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
goto leave;
}
- ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
- if (ret)
- mlog_errno(ret);
+ if (!ret && default_acl)
+ ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ if (!ret && acl)
+ ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS);
ocfs2_inode_unlock(dir, 0);
brelse(dir_bh);
@@ -7212,15 +7273,37 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
+int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
+{
+ const struct xattr *xattr;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
+ xattr->name, xattr->value,
+ xattr->value_len, XATTR_CREATE);
+ if (err)
+ break;
+ }
+ return err;
+}
+
int ocfs2_init_security_get(struct inode *inode,
struct inode *dir,
+ const struct qstr *qstr,
struct ocfs2_security_xattr_info *si)
{
/* check whether ocfs2 support feature xattr */
if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
return -EOPNOTSUPP;
- return security_inode_init_security(inode, dir, &si->name, &si->value,
- &si->value_len);
+ if (si)
+ return security_old_inode_init_security(inode, dir, qstr,
+ &si->name, &si->value,
+ &si->value_len);
+
+ return security_inode_init_security(inode, dir, qstr,
+ &ocfs2_initxattrs, NULL);
}
int ocfs2_init_security_set(handle_t *handle,
@@ -7236,7 +7319,7 @@ int ocfs2_init_security_set(handle_t *handle,
xattr_ac, data_ac);
}
-struct xattr_handler ocfs2_xattr_security_handler = {
+const struct xattr_handler ocfs2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ocfs2_xattr_security_list,
.get = ocfs2_xattr_security_get,
@@ -7280,7 +7363,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-struct xattr_handler ocfs2_xattr_trusted_handler = {
+const struct xattr_handler ocfs2_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.list = ocfs2_xattr_trusted_list,
.get = ocfs2_xattr_trusted_get,
@@ -7336,7 +7419,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-struct xattr_handler ocfs2_xattr_user_handler = {
+const struct xattr_handler ocfs2_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.list = ocfs2_xattr_user_list,
.get = ocfs2_xattr_user_get,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index abd72a47f52..f10d5b93c36 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -32,17 +32,15 @@ enum ocfs2_xattr_type {
struct ocfs2_security_xattr_info {
int enable;
- char *name;
+ const char *name;
void *value;
size_t value_len;
};
-extern struct xattr_handler ocfs2_xattr_user_handler;
-extern struct xattr_handler ocfs2_xattr_trusted_handler;
-extern struct xattr_handler ocfs2_xattr_security_handler;
-extern struct xattr_handler ocfs2_xattr_acl_access_handler;
-extern struct xattr_handler ocfs2_xattr_acl_default_handler;
-extern struct xattr_handler *ocfs2_xattr_handlers[];
+extern const struct xattr_handler ocfs2_xattr_user_handler;
+extern const struct xattr_handler ocfs2_xattr_trusted_handler;
+extern const struct xattr_handler ocfs2_xattr_security_handler;
+extern const struct xattr_handler *ocfs2_xattr_handlers[];
ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
@@ -57,6 +55,7 @@ int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
struct ocfs2_dinode *di);
int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
int ocfs2_init_security_get(struct inode *, struct inode *,
+ const struct qstr *,
struct ocfs2_security_xattr_info *);
int ocfs2_init_security_set(handle_t *, struct inode *,
struct buffer_head *,
@@ -67,7 +66,7 @@ int ocfs2_calc_security_init(struct inode *,
struct ocfs2_security_xattr_info *,
int *, int *, struct ocfs2_alloc_context **);
int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
- int, struct ocfs2_security_xattr_info *,
+ umode_t, struct ocfs2_security_xattr_info *,
int *, int *, int *);
/*
@@ -94,5 +93,8 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
struct buffer_head *new_bh,
bool preserve_security);
int ocfs2_init_security_and_acl(struct inode *dir,
- struct inode *inode);
+ struct inode *inode,
+ const struct qstr *qstr,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl);
#endif /* OCFS2_XATTR_H */