[PATCH] OCFS2: The Second Oracle Cluster Filesystem

The OCFS2 file system module. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
author: Mark Fasheh <mark.fasheh@oracle.com> 2005-12-15 14:31:24 -0800
committer: Joel Becker <joel.becker@oracle.com> 2006-01-03 11:45:47 -0800
commit: ccd979bdbce9fba8412beb3f1de68a9d0171b12c (patch)
tree: c50ed941849ce06ccadd4ce27599b3ef9fdbe2ae /fs/ocfs2/namei.c
parent: 8df08c89c668e1bd922a053fdb5ba1fadbecbb38 (diff)
1 files changed, 2264 insertions, 0 deletions
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
new file mode 100644
index 00000000000..f6b77ff1d2b
--- /dev/null
+++ b/fs/ocfs2/namei.c
@@ -0,0 +1,2264 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * namei.c
+ *
+ * Create and rename file, directory, symlinks
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ *  Portions of this code from linux/fs/ext3/dir.c
+ *
+ *  Copyright (C) 1992, 1993, 1994, 1995
+ *  Remy Card (card@masi.ibp.fr)
+ *  Laboratoire MASI - Institut Blaise pascal
+ *  Universite Pierre et Marie Curie (Paris VI)
+ *
+ *   from
+ *
+ *   linux/fs/minix/dir.c
+ *
+ *   Copyright (C) 1991, 1992 Linux Torvalds
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_NAMEI
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dcache.h"
+#include "dir.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "namei.h"
+#include "suballoc.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "uptodate.h"
+#include "vote.h"
+
+#include "buffer_head_io.h"
+
+#define NAMEI_RA_CHUNKS  2
+#define NAMEI_RA_BLOCKS  4
+#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
+
+static int inline ocfs2_search_dirblock(struct buffer_head *bh,
+					struct inode *dir,
+					const char *name, int namelen,
+					unsigned long offset,
+					struct ocfs2_dir_entry **res_dir);
+
+static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
+			      struct inode *dir,
+			      struct ocfs2_dir_entry *de_del,
+			      struct buffer_head *bh);
+
+static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+			     struct inode *dir,
+			     const char *name, int namelen,
+			     struct inode *inode, u64 blkno,
+			     struct buffer_head *parent_fe_bh,
+			     struct buffer_head *insert_bh);
+
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct dentry *dentry, int mode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode **ret_inode,
+			      struct ocfs2_alloc_context *inode_ac);
+
+static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode *parent,
+			      struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      struct ocfs2_alloc_context *data_ac);
+
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct ocfs2_journal_handle *handle,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2);
+
+static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
+				    struct ocfs2_journal_handle *handle,
+				    struct inode *inode,
+				    char *name,
+				    struct buffer_head **de_bh);
+
+static int ocfs2_orphan_add(struct ocfs2_super *osb,
+			    struct ocfs2_journal_handle *handle,
+			    struct inode *inode,
+			    struct ocfs2_dinode *fe,
+			    char *name,
+			    struct buffer_head *de_bh);
+
+static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
+				     struct ocfs2_journal_handle *handle,
+				     struct inode *inode,
+				     const char *symname);
+
+static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle,
+				  struct dentry *dentry,
+				  struct inode *inode, u64 blkno,
+				  struct buffer_head *parent_fe_bh,
+				  struct buffer_head *insert_bh)
+{
+	return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
+				 dentry->d_name.name, dentry->d_name.len,
+				 inode, blkno, parent_fe_bh, insert_bh);
+}
+
+/* An orphan dir name is an 8 byte value, printed as a hex string */
+#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
+
+static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	int status;
+	u64 blkno;
+	struct buffer_head *dirent_bh = NULL;
+	struct inode *inode = NULL;
+	struct dentry *ret;
+	struct ocfs2_dir_entry *dirent;
+	struct ocfs2_inode_info *oi;
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
+		ret = ERR_PTR(-ENAMETOOLONG);
+		goto bail;
+	}
+
+	mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len,
+	     dentry->d_name.name, OCFS2_I(dir)->ip_blkno);
+
+	status = ocfs2_meta_lock(dir, NULL, NULL, 0);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		ret = ERR_PTR(status);
+		goto bail;
+	}
+
+	status = ocfs2_find_files_on_disk(dentry->d_name.name,
+					  dentry->d_name.len, &blkno,
+					  dir, &dirent_bh, &dirent);
+	if (status < 0)
+		goto bail_add;
+
+	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
+	if (IS_ERR(inode)) {
+		mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
+		ret = ERR_PTR(-EACCES);
+		goto bail_unlock;
+	}
+
+	oi = OCFS2_I(inode);
+	/* Clear any orphaned state... If we were able to look up the
+	 * inode from a directory, it certainly can't be orphaned. We
+	 * might have the bad state from a node which intended to
+	 * orphan this inode but crashed before it could commit the
+	 * unlink. */
+	spin_lock(&oi->ip_lock);
+	oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
+	oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
+	spin_unlock(&oi->ip_lock);
+
+bail_add:
+
+	dentry->d_op = &ocfs2_dentry_ops;
+	ret = d_splice_alias(inode, dentry);
+
+bail_unlock:
+	/* Don't drop the cluster lock until *after* the d_add --
+	 * unlink on another node will message us to remove that
+	 * dentry under this lock so otherwise we can race this with
+	 * the vote thread and have a stale dentry. */
+	ocfs2_meta_unlock(dir, 0);
+
+bail:
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	mlog_exit_ptr(ret);
+
+	return ret;
+}
+
+static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode *parent,
+			      struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      struct ocfs2_alloc_context *data_ac)
+{
+	int status;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry *de = NULL;
+
+	mlog_entry_void();
+
+	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
+				     data_ac, NULL, &new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+	status = ocfs2_journal_access(handle, inode, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
+
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+	de->name_len = 1;
+	de->rec_len =
+		cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+	strcpy(de->name, ".");
+	ocfs2_set_de_type(de, S_IFDIR);
+	de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
+	de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
+	de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
+				  OCFS2_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	ocfs2_set_de_type(de, S_IFDIR);
+
+	status = ocfs2_journal_dirty(handle, new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	i_size_write(inode, inode->i_sb->s_blocksize);
+	inode->i_nlink = 2;
+	inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if (new_bh)
+		brelse(new_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_mknod(struct inode *dir,
+		       struct dentry *dentry,
+		       int mode,
+		       dev_t dev)
+{
+	int status = 0;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_super *osb;
+	struct ocfs2_dinode *dirfe;
+	struct buffer_head *new_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct inode *inode = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
+		   (unsigned long)dev, dentry->d_name.len,
+		   dentry->d_name.name);
+
+	/* get our super block */
+	osb = OCFS2_SB(dir->i_sb);
+
+	if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
+		mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n",
+		     OCFS2_I(dir)->ip_blkno, dir->i_nlink);
+		status = -EMLINK;
+		goto leave;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	if (!dirfe->i_links_count) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto leave;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto leave;
+
+	/* get a spot inside the dir. */
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* reserve an inode spot */
+	status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	/* are we making a directory? If so, reserve a cluster for his
+	 * 1st extent. */
+	if (S_ISDIR(mode)) {
+		status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
+		if (status < 0) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* do the real work now. */
+	status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
+				    &new_fe_bh, parent_fe_bh, handle,
+				    &inode, inode_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (S_ISDIR(mode)) {
+		status = ocfs2_fill_new_dir(osb, handle, dir, inode,
+					    new_fe_bh, data_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+
+		status = ocfs2_journal_access(handle, dir, parent_fe_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+		le16_add_cpu(&dirfe->i_links_count, 1);
+		status = ocfs2_journal_dirty(handle, parent_fe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+		dir->i_nlink++;
+	}
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
+				 de_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	insert_inode_hash(inode);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+	status = 0;
+leave:
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (status == -ENOSPC)
+		mlog(0, "Disk is full\n");
+
+	if (new_fe_bh)
+		brelse(new_fe_bh);
+
+	if (de_bh)
+		brelse(de_bh);
+
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+
+	if ((status < 0) && inode)
+		iput(inode);
+
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+static int ocfs2_mknod_locked(struct ocfs2_super *osb,
+			      struct inode *dir,
+			      struct dentry *dentry, int mode,
+			      dev_t dev,
+			      struct buffer_head **new_fe_bh,
+			      struct buffer_head *parent_fe_bh,
+			      struct ocfs2_journal_handle *handle,
+			      struct inode **ret_inode,
+			      struct ocfs2_alloc_context *inode_ac)
+{
+	int status = 0;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_extent_list *fel;
+	u64 fe_blkno = 0;
+	u16 suballoc_bit;
+	struct inode *inode = NULL;
+
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
+		   (unsigned long)dev, dentry->d_name.len,
+		   dentry->d_name.name);
+
+	*new_fe_bh = NULL;
+	*ret_inode = NULL;
+
+	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
+				       &fe_blkno);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	inode = new_inode(dir->i_sb);
+	if (IS_ERR(inode)) {
+		status = PTR_ERR(inode);
+		mlog(ML_ERROR, "new_inode failed!\n");
+		goto leave;
+	}
+
+	/* populate as many fields early on as possible - many of
+	 * these are used by the support functions here and in
+	 * callers. */
+	inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
+	OCFS2_I(inode)->ip_blkno = fe_blkno;
+	if (S_ISDIR(mode))
+		inode->i_nlink = 2;
+	else
+		inode->i_nlink = 1;
+	inode->i_mode = mode;
+	spin_lock(&osb->osb_lock);
+	inode->i_generation = osb->s_next_generation++;
+	spin_unlock(&osb->osb_lock);
+
+	*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
+	if (!*new_fe_bh) {
+		status = -EIO;
+		mlog_errno(status);
+		goto leave;
+	}
+	ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
+
+	status = ocfs2_journal_access(handle, inode, *new_fe_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data;
+	memset(fe, 0, osb->sb->s_blocksize);
+
+	fe->i_generation = cpu_to_le32(inode->i_generation);
+	fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
+	fe->i_blkno = cpu_to_le64(fe_blkno);
+	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
+	fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
+	fe->i_uid = cpu_to_le32(current->fsuid);
+	if (dir->i_mode & S_ISGID) {
+		fe->i_gid = cpu_to_le32(dir->i_gid);
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		fe->i_gid = cpu_to_le32(current->fsgid);
+	fe->i_mode = cpu_to_le16(mode);
+	if (S_ISCHR(mode) || S_ISBLK(mode))
+		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
+
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+
+	fe->i_last_eb_blk = 0;
+	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
+	le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
+	fe->i_atime = fe->i_ctime = fe->i_mtime =
+		cpu_to_le64(CURRENT_TIME.tv_sec);
+	fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
+		cpu_to_le32(CURRENT_TIME.tv_nsec);
+	fe->i_dtime = 0;
+
+	fel = &fe->id2.i_list;
+	fel->l_tree_depth = 0;
+	fel->l_next_free_rec = 0;
+	fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
+
+	status = ocfs2_journal_dirty(handle, *new_fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (ocfs2_populate_inode(inode, fe, 1) < 0) {
+		mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
+		     "i_blkno=%"MLFu64", i_ino=%lu\n",
+		     (unsigned long long) (*new_fe_bh)->b_blocknr,
+		     fe->i_blkno, inode->i_ino);
+		BUG();
+	}
+
+	ocfs2_inode_set_new(osb, inode);
+	status = ocfs2_create_new_inode_locks(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+	status = 0; /* error in ocfs2_create_new_inode_locks is not
+		     * critical */
+
+	*ret_inode = inode;
+leave:
+	if (status < 0) {
+		if (*new_fe_bh) {
+			brelse(*new_fe_bh);
+			*new_fe_bh = NULL;
+		}
+		if (inode)
+			iput(inode);
+	}
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_mkdir(struct inode *dir,
+		       struct dentry *dentry,
+		       int mode)
+{
+	int ret;
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
+		   dentry->d_name.len, dentry->d_name.name);
+	ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
+	mlog_exit(ret);
+
+	return ret;
+}
+
+static int ocfs2_create(struct inode *dir,
+			struct dentry *dentry,
+			int mode,
+			struct nameidata *nd)
+{
+	int ret;
+
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
+		   dentry->d_name.len, dentry->d_name.name);
+	ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
+	mlog_exit(ret);
+
+	return ret;
+}
+
+static int ocfs2_link(struct dentry *old_dentry,
+		      struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct ocfs2_journal_handle *handle = NULL;
+	struct inode *inode = old_dentry->d_inode;
+	int err;
+	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *parent_fe_bh = NULL;
+	struct buffer_head *de_bh = NULL;
+	struct ocfs2_dinode *fe = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
+		   old_dentry->d_name.len, old_dentry->d_name.name,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	if (S_ISDIR(inode->i_mode)) {
+		err = -EPERM;
+		goto bail;
+	}
+
+	if (inode->i_nlink >= OCFS2_LINK_MAX) {
+		err = -EMLINK;
+		goto bail;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		err = -ENOMEM;
+		goto bail;
+	}
+
+	err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
+	if (err < 0) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					dentry->d_name.len);
+	if (err)
+		goto bail;
+
+	err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					   dentry->d_name.name,
+					   dentry->d_name.len, &de_bh);
+	if (err < 0) {
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
+	if (err < 0) {
+		if (err != -ENOENT)
+			mlog_errno(err);
+		goto bail;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
+		err = -EMLINK;
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_journal_access(handle, inode, fe_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (err < 0) {
+		mlog_errno(err);
+		goto bail;
+	}
+
+	inode->i_nlink++;
+	inode->i_ctime = CURRENT_TIME;
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	err = ocfs2_journal_dirty(handle, fe_bh);
+	if (err < 0) {
+		le16_add_cpu(&fe->i_links_count, -1);
+		inode->i_nlink--;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	err = ocfs2_add_entry(handle, dentry, inode,
+			      OCFS2_I(inode)->ip_blkno,
+			      parent_fe_bh, de_bh);
+	if (err) {
+		le16_add_cpu(&fe->i_links_count, -1);
+		inode->i_nlink--;
+		mlog_errno(err);
+		goto bail;
+	}
+
+	atomic_inc(&inode->i_count);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+bail:
+	if (handle)
+		ocfs2_commit_trans(handle);
+	if (de_bh)
+		brelse(de_bh);
+	if (fe_bh)
+		brelse(fe_bh);
+	if (parent_fe_bh)
+		brelse(parent_fe_bh);
+
+	mlog_exit(err);
+
+	return err;
+}
+
+static int ocfs2_unlink(struct inode *dir,
+			struct dentry *dentry)
+{
+	int status;
+	unsigned int saved_nlink = 0;
+	struct inode *inode = dentry->d_inode;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	u64 blkno;
+	struct ocfs2_dinode *fe = NULL;
+	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *parent_node_bh = NULL;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct ocfs2_dir_entry *dirent = NULL;
+	struct buffer_head *dirent_bh = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *orphan_entry_bh = NULL;
+
+	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	BUG_ON(dentry->d_parent->d_inode != dir);
+
+	mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
+
+	if (inode == osb->root_inode) {
+		mlog(0, "Cannot delete the root directory\n");
+		status = -EPERM;
+		goto leave;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_find_files_on_disk(dentry->d_name.name,
+					  dentry->d_name.len, &blkno,
+					  dir, &dirent_bh, &dirent);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	if (OCFS2_I(inode)->ip_blkno != blkno) {
+		status = -ENOENT;
+
+		mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") "
+		     "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno,
+		     OCFS2_I(inode)->ip_flags);
+		goto leave;
+	}
+
+	status = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	if (S_ISDIR(inode->i_mode)) {
+	       	if (!ocfs2_empty_dir(inode)) {
+			status = -ENOTEMPTY;
+			goto leave;
+		} else if (inode->i_nlink != 2) {
+			status = -ENOTEMPTY;
+			goto leave;
+		}
+	}
+
+	/* There are still a few steps left until we can consider the
+	 * unlink to have succeeded. Save off nlink here before
+	 * modification so we can set it back in case we hit an issue
+	 * before commit. */
+	saved_nlink = inode->i_nlink;
+	if (S_ISDIR(inode->i_mode))
+		inode->i_nlink = 0;
+	else
+		inode->i_nlink--;
+
+	status = ocfs2_request_unlink_vote(inode, dentry,
+					   (unsigned int) inode->i_nlink);
+	if (status < 0) {
+		/* This vote should succeed under all normal
+		 * circumstances. */
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (!inode->i_nlink) {
+		status = ocfs2_prepare_orphan_dir(osb, handle, inode,
+						  orphan_name,
+						  &orphan_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+	if (!inode->i_nlink) {
+		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
+					  orphan_entry_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	/* delete the name from the parent dir */
+	status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* We can set nlink on the dinode now. clear the saved version
+	 * so that it doesn't get set later. */
+	fe->i_links_count = cpu_to_le16(inode->i_nlink);
+	saved_nlink = 0;
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (S_ISDIR(inode->i_mode)) {
+		dir->i_nlink--;
+		status = ocfs2_mark_inode_dirty(handle, dir,
+						parent_node_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			dir->i_nlink++;
+		}
+	}
+
+leave:
+	if (status < 0 && saved_nlink)
+		inode->i_nlink = saved_nlink;
+
+	if (handle)
+		ocfs2_commit_trans(handle);
+
+	if (fe_bh)
+		brelse(fe_bh);
+
+	if (dirent_bh)
+		brelse(dirent_bh);
+
+	if (parent_node_bh)
+		brelse(parent_node_bh);
+
+	if (orphan_entry_bh)
+		brelse(orphan_entry_bh);
+
+	mlog_exit(status);
+
+	return status;
+}
+
+/*
+ * The only place this should be used is rename!
+ * if they have the same id, then the 1st one is the only one locked.
+ */
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct ocfs2_journal_handle *handle,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2)
+{
+	int status;
+	struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
+	struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
+	struct buffer_head **tmpbh;
+	struct inode *tmpinode;
+
+	mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n",
+		   oi1->ip_blkno, oi2->ip_blkno);
+
+	BUG_ON(!handle);
+
+	if (*bh1)
+		*bh1 = NULL;
+	if (*bh2)
+		*bh2 = NULL;
+
+	/* we always want to lock the one with the lower lockid first. */
+	if (oi1->ip_blkno != oi2->ip_blkno) {
+		if (oi1->ip_blkno < oi2->ip_blkno) {
+			/* switch id1 and id2 around */
+			mlog(0, "switching them around...\n");
+			tmpbh = bh2;
+			bh2 = bh1;
+			bh1 = tmpbh;
+
+			tmpinode = inode2;
+			inode2 = inode1;
+			inode1 = tmpinode;
+		}
+		/* lock id2 */
+		status = ocfs2_meta_lock(inode2, handle, bh2, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+	}
+	/* lock id1 */
+	status = ocfs2_meta_lock(inode1, handle, bh1, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		goto bail;
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+#define PARENT_INO(buffer) \
+	((struct ocfs2_dir_entry *) \
+	 ((char *)buffer + \
+	  le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode
+
+static int ocfs2_rename(struct inode *old_dir,
+			struct dentry *old_dentry,
+			struct inode *new_dir,
+			struct dentry *new_dentry)
+{
+	int status = 0, rename_lock = 0;
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct ocfs2_dinode *newfe = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *orphan_entry_bh = NULL;
+	struct buffer_head *newfe_bh = NULL;
+	struct buffer_head *insert_entry_bh = NULL;
+	struct ocfs2_super *osb = NULL;
+	u64 newfe_blkno;
+	struct ocfs2_journal_handle *handle = NULL;
+	struct buffer_head *old_dir_bh = NULL;
+	struct buffer_head *new_dir_bh = NULL;
+	struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
+							       // and new_dentry
+	struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
+	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
+						    // this is the 1st dirent bh
+	nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink;
+	unsigned int links_count;
+
+	/* At some point it might be nice to break this function up a
+	 * bit. */
+
+	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
+		   old_dir, old_dentry, new_dir, new_dentry,
+		   old_dentry->d_name.len, old_dentry->d_name.name,
+		   new_dentry->d_name.len, new_dentry->d_name.name);
+
+	osb = OCFS2_SB(old_dir->i_sb);
+
+	if (new_inode) {
+		if (!igrab(new_inode))
+			BUG();
+	}
+
+	if (atomic_read(&old_dentry->d_count) > 2) {
+		shrink_dcache_parent(old_dentry);
+		if (atomic_read(&old_dentry->d_count) > 2) {
+			status = -EBUSY;
+			goto bail;
+		}
+	}
+
+	/* Assume a directory heirarchy thusly:
+	 * a/b/c
+	 * a/d
+	 * a,b,c, and d are all directories.
+	 *
+	 * from cwd of 'a' on both nodes:
+	 * node1: mv b/c d
+	 * node2: mv d   b/c
+	 *
+	 * And that's why, just like the VFS, we need a file system
+	 * rename lock. */
+	if (old_dentry != new_dentry) {
+		status = ocfs2_rename_lock(osb);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+		rename_lock = 1;
+	}
+
+	handle = ocfs2_alloc_handle(osb);
+	if (handle == NULL) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* if old and new are the same, this'll just do one lock. */
+	status = ocfs2_double_lock(osb, handle,
+				  &old_dir_bh, old_dir,
+				  &new_dir_bh, new_dir);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* make sure both dirs have bhs
+	 * get an extra ref on old_dir_bh if old==new */
+	if (!new_dir_bh) {
+		if (old_dir_bh) {
+			new_dir_bh = old_dir_bh;
+			get_bh(new_dir_bh);
+		} else {
+			mlog(ML_ERROR, "no old_dir_bh!\n");
+			status = -EIO;
+			goto bail;
+		}
+	}
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		/* Directories actually require metadata updates to
+		 * the directory info so we can't get away with not
+		 * doing node locking on it. */
+		status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			goto bail;
+		}
+
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		status = -EIO;
+		old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
+		if (!old_inode_de_bh)
+			goto bail;
+
+		status = -EIO;
+		if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
+		    OCFS2_I(old_dir)->ip_blkno)
+			goto bail;
+		status = -EMLINK;
+		if (!new_inode && new_dir!=old_dir &&
+		    new_dir->i_nlink >= OCFS2_LINK_MAX)
+			goto bail;
+	} else {
+		/* Ah, the simple case - we're a file so just send a
+		 * message. */
+		status = ocfs2_request_rename_vote(old_inode, old_dentry);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	status = -ENOENT;
+	old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
+				     old_dentry->d_name.len,
+				     old_dir, &old_de);
+	if (!old_de_bh)
+		goto bail;
+
+	/*
+	 *  Check for inode number is _not_ due to possible IO errors.
+	 *  We might rmdir the source, keep it as pwd of some process
+	 *  and merrily kill the link to whatever was created under the
+	 *  same name. Goodbye sticky bit ;-<
+	 */
+	if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
+		goto bail;
+
+	/* check if the target already exists (in which case we need
+	 * to delete it */
+	status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
+					  new_dentry->d_name.len,
+					  &newfe_blkno, new_dir, &new_de_bh,
+					  &new_de);
+	/* The only error we allow here is -ENOENT because the new
+	 * file not existing is perfectly valid. */
+	if ((status < 0) && (status != -ENOENT)) {
+		/* If we cannot find the file specified we should just */
+		/* return the error... */
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (!new_de && new_inode)
+		mlog(ML_ERROR, "inode %lu does not exist in it's parent "
+		     "directory!", new_inode->i_ino);
+
+	/* In case we need to overwrite an existing file, we blow it
+	 * away first */
+	if (new_de) {
+		/* VFS didn't think there existed an inode here, but
+		 * someone else in the cluster must have raced our
+		 * rename to create one. Today we error cleanly, in
+		 * the future we should consider calling iget to build
+		 * a new struct inode for this entry. */
+		if (!new_inode) {
+			status = -EACCES;
+
+			mlog(0, "We found an inode for name %.*s but VFS "
+			     "didn't give us one.\n", new_dentry->d_name.len,
+			     new_dentry->d_name.name);
+			goto bail;
+		}
+
+		if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
+			status = -EACCES;
+
+			mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") "
+			     "disagree. ip_flags = %x\n",
+			     OCFS2_I(new_inode)->ip_blkno, newfe_blkno,
+			     OCFS2_I(new_inode)->ip_flags);
+			goto bail;
+		}
+
+		status =
author	Mark Fasheh <mark.fasheh@oracle.com>	2005-12-15 14:31:24 -0800
committer	Joel Becker <joel.becker@oracle.com>	2006-01-03 11:45:47 -0800
commit	ccd979bdbce9fba8412beb3f1de68a9d0171b12c (patch)
tree	c50ed941849ce06ccadd4ce27599b3ef9fdbe2ae /fs/ocfs2/namei.c
parent	8df08c89c668e1bd922a053fdb5ba1fadbecbb38 (diff)