diff options
Diffstat (limited to 'fs/ocfs2')
67 files changed, 1443 insertions, 1404 deletions
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index f17e58b3298..ce210d4951a 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -38,7 +38,6 @@ ocfs2-objs := \  	symlink.o 		\  	sysfile.o 		\  	uptodate.o		\ -	ver.o			\  	quota_local.o		\  	quota_global.o		\  	xattr.o			\ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index b4f788e0ca3..7e8282dcea2 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -160,36 +160,6 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,  	return acl;  } - -/* - * Get posix acl. - */ -static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type) -{ -	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); -	struct buffer_head *di_bh = NULL; -	struct posix_acl *acl; -	int ret; - -	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) -		return NULL; - -	ret = ocfs2_inode_lock(inode, &di_bh, 0); -	if (ret < 0) { -		mlog_errno(ret); -		acl = ERR_PTR(ret); -		return acl; -	} - -	acl = ocfs2_get_acl_nolock(inode, type, di_bh); - -	ocfs2_inode_unlock(inode, 0); - -	brelse(di_bh); - -	return acl; -} -  /*   * Helper function to set i_mode in memory and disk. Some call paths   * will not have di_bh or a journal handle to pass, in which case it @@ -235,6 +205,7 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,  	di->i_mode = cpu_to_le16(inode->i_mode);  	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);  	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); +	ocfs2_update_inode_fsync_trans(handle, inode, 0);  	ocfs2_journal_dirty(handle, di_bh); @@ -250,7 +221,7 @@ out:  /*   * Set the access or default ACL of an inode.   */ -static int ocfs2_set_acl(handle_t *handle, +int ocfs2_set_acl(handle_t *handle,  			 struct inode *inode,  			 struct buffer_head *di_bh,  			 int type, @@ -313,6 +284,11 @@ static int ocfs2_set_acl(handle_t *handle,  	return ret;  } +int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ +	return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); +} +  struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)  {  	struct ocfs2_super *osb; @@ -334,200 +310,3 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)  	return acl;  } - -int ocfs2_acl_chmod(struct inode *inode) -{ -	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); -	struct posix_acl *acl; -	int ret; - -	if (S_ISLNK(inode->i_mode)) -		return -EOPNOTSUPP; - -	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) -		return 0; - -	acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); -	if (IS_ERR(acl) || !acl) -		return PTR_ERR(acl); -	ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); -	if (ret) -		return ret; -	ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, -			    acl, NULL, NULL); -	posix_acl_release(acl); -	return ret; -} - -/* - * Initialize the ACLs of a new inode. If parent directory has default ACL, - * then clone to new inode. Called from ocfs2_mknod. - */ -int ocfs2_init_acl(handle_t *handle, -		   struct inode *inode, -		   struct inode *dir, -		   struct buffer_head *di_bh, -		   struct buffer_head *dir_bh, -		   struct ocfs2_alloc_context *meta_ac, -		   struct ocfs2_alloc_context *data_ac) -{ -	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); -	struct posix_acl *acl = NULL; -	int ret = 0, ret2; -	umode_t mode; - -	if (!S_ISLNK(inode->i_mode)) { -		if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { -			acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, -						   dir_bh); -			if (IS_ERR(acl)) -				return PTR_ERR(acl); -		} -		if (!acl) { -			mode = inode->i_mode & ~current_umask(); -			ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); -			if (ret) { -				mlog_errno(ret); -				goto cleanup; -			} -		} -	} -	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { -		if (S_ISDIR(inode->i_mode)) { -			ret = ocfs2_set_acl(handle, inode, di_bh, -					    ACL_TYPE_DEFAULT, acl, -					    meta_ac, data_ac); -			if (ret) -				goto cleanup; -		} -		mode = inode->i_mode; -		ret = posix_acl_create(&acl, GFP_NOFS, &mode); -		if (ret < 0) -			return ret; - -		ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); -		if (ret2) { -			mlog_errno(ret2); -			ret = ret2; -			goto cleanup; -		} -		if (ret > 0) { -			ret = ocfs2_set_acl(handle, inode, -					    di_bh, ACL_TYPE_ACCESS, -					    acl, meta_ac, data_ac); -		} -	} -cleanup: -	posix_acl_release(acl); -	return ret; -} - -static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry, -					  char *list, -					  size_t list_len, -					  const char *name, -					  size_t name_len, -					  int type) -{ -	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); -	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - -	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) -		return 0; - -	if (list && size <= list_len) -		memcpy(list, POSIX_ACL_XATTR_ACCESS, size); -	return size; -} - -static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry, -					   char *list, -					   size_t list_len, -					   const char *name, -					   size_t name_len, -					   int type) -{ -	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); -	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - -	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) -		return 0; - -	if (list && size <= list_len) -		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); -	return size; -} - -static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name, -		void *buffer, size_t size, int type) -{ -	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); -	struct posix_acl *acl; -	int ret; - -	if (strcmp(name, "") != 0) -		return -EINVAL; -	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) -		return -EOPNOTSUPP; - -	acl = ocfs2_get_acl(dentry->d_inode, type); -	if (IS_ERR(acl)) -		return PTR_ERR(acl); -	if (acl == NULL) -		return -ENODATA; -	ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); -	posix_acl_release(acl); - -	return ret; -} - -static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, -		const void *value, size_t size, int flags, int type) -{ -	struct inode *inode = dentry->d_inode; -	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); -	struct posix_acl *acl; -	int ret = 0; - -	if (strcmp(name, "") != 0) -		return -EINVAL; -	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) -		return -EOPNOTSUPP; - -	if (!inode_owner_or_capable(inode)) -		return -EPERM; - -	if (value) { -		acl = posix_acl_from_xattr(&init_user_ns, value, size); -		if (IS_ERR(acl)) -			return PTR_ERR(acl); -		else if (acl) { -			ret = posix_acl_valid(acl); -			if (ret) -				goto cleanup; -		} -	} else -		acl = NULL; - -	ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); - -cleanup: -	posix_acl_release(acl); -	return ret; -} - -const struct xattr_handler ocfs2_xattr_acl_access_handler = { -	.prefix	= POSIX_ACL_XATTR_ACCESS, -	.flags	= ACL_TYPE_ACCESS, -	.list	= ocfs2_xattr_list_acl_access, -	.get	= ocfs2_xattr_get_acl, -	.set	= ocfs2_xattr_set_acl, -}; - -const struct xattr_handler ocfs2_xattr_acl_default_handler = { -	.prefix	= POSIX_ACL_XATTR_DEFAULT, -	.flags	= ACL_TYPE_DEFAULT, -	.list	= ocfs2_xattr_list_acl_default, -	.get	= ocfs2_xattr_get_acl, -	.set	= ocfs2_xattr_set_acl, -}; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 071fbd380f2..3fce68d0862 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -27,10 +27,13 @@ struct ocfs2_acl_entry {  };  struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); -extern int ocfs2_acl_chmod(struct inode *); -extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, -			  struct buffer_head *, struct buffer_head *, -			  struct ocfs2_alloc_context *, -			  struct ocfs2_alloc_context *); +int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ocfs2_set_acl(handle_t *handle, +			 struct inode *inode, +			 struct buffer_head *di_bh, +			 int type, +			 struct posix_acl *acl, +			 struct ocfs2_alloc_context *meta_ac, +			 struct ocfs2_alloc_context *data_ac);  #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 17e6bdde96c..9d8fcf2f3b9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1025,7 +1025,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,  		for(i = count;  i < (num_got + count); i++) {  			bhs[i] = sb_getblk(osb->sb, first_blkno);  			if (bhs[i] == NULL) { -				status = -EIO; +				status = -ENOMEM;  				mlog_errno(status);  				goto bail;  			} @@ -4742,6 +4742,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,  				enum ocfs2_alloc_restarted *reason_ret)  {  	int status = 0, err = 0; +	int need_free = 0;  	int free_extents;  	enum ocfs2_alloc_restarted reason = RESTART_NONE;  	u32 bit_off, num_bits; @@ -4796,7 +4797,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,  					      OCFS2_JOURNAL_ACCESS_WRITE);  	if (status < 0) {  		mlog_errno(status); -		goto leave; +		need_free = 1; +		goto bail;  	}  	block = ocfs2_clusters_to_blocks(osb->sb, bit_off); @@ -4807,7 +4809,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,  				     num_bits, flags, meta_ac);  	if (status < 0) {  		mlog_errno(status); -		goto leave; +		need_free = 1; +		goto bail;  	}  	ocfs2_journal_dirty(handle, et->et_root_bh); @@ -4821,6 +4824,19 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,  		reason = RESTART_TRANS;  	} +bail: +	if (need_free) { +		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) +			ocfs2_free_local_alloc_bits(osb, handle, data_ac, +					bit_off, num_bits); +		else +			ocfs2_free_clusters(handle, +					data_ac->ac_inode, +					data_ac->ac_bh, +					ocfs2_clusters_to_blocks(osb->sb, bit_off), +					num_bits); +	} +  leave:  	if (reason_ret)  		*reason_ret = reason; @@ -5712,6 +5728,7 @@ int ocfs2_remove_btree_range(struct inode *inode,  	}  	ocfs2_et_update_clusters(et, -len); +	ocfs2_update_inode_fsync_trans(handle, inode, 1);  	ocfs2_journal_dirty(handle, et->et_root_bh); @@ -6029,7 +6046,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)  void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,  				       int cancel)  { -	if (osb->osb_tl_inode) { +	if (osb->osb_tl_inode && +			atomic_read(&osb->osb_tl_disable) == 0) {  		/* We want to push off log flushes while truncates are  		 * still running. */  		if (cancel) @@ -6206,6 +6224,8 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)  	int status;  	struct inode *tl_inode = osb->osb_tl_inode; +	atomic_set(&osb->osb_tl_disable, 1); +  	if (tl_inode) {  		cancel_delayed_work(&osb->osb_truncate_log_wq);  		flush_workqueue(ocfs2_wq); @@ -6237,6 +6257,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)  	 * until we're sure all is well. */  	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,  			  ocfs2_truncate_log_worker); +	atomic_set(&osb->osb_tl_disable, 0);  	osb->osb_tl_bh    = tl_bh;  	osb->osb_tl_inode = tl_inode; @@ -6805,6 +6826,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,  					 struct buffer_head *di_bh)  {  	int ret, i, has_data, num_pages = 0; +	int need_free = 0; +	u32 bit_off, num;  	handle_t *handle;  	u64 uninitialized_var(block);  	struct ocfs2_inode_info *oi = OCFS2_I(inode); @@ -6850,7 +6873,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,  	}  	if (has_data) { -		u32 bit_off, num;  		unsigned int page_end;  		u64 phys; @@ -6886,6 +6908,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,  		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);  		if (ret) {  			mlog_errno(ret); +			need_free = 1;  			goto out_commit;  		} @@ -6896,6 +6919,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,  		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);  		if (ret) {  			mlog_errno(ret); +			need_free = 1;  			goto out_commit;  		} @@ -6913,6 +6937,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,  	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);  	spin_unlock(&oi->ip_lock); +	ocfs2_update_inode_fsync_trans(handle, inode, 1);  	ocfs2_dinode_new_extent_list(inode, di);  	ocfs2_journal_dirty(handle, di_bh); @@ -6927,6 +6952,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,  		ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);  		if (ret) {  			mlog_errno(ret); +			need_free = 1;  			goto out_commit;  		} @@ -6938,6 +6964,18 @@ out_commit:  		dquot_free_space_nodirty(inode,  					  ocfs2_clusters_to_bytes(osb->sb, 1)); +	if (need_free) { +		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) +			ocfs2_free_local_alloc_bits(osb, handle, data_ac, +					bit_off, num); +		else +			ocfs2_free_clusters(handle, +					data_ac->ac_inode, +					data_ac->ac_bh, +					ocfs2_clusters_to_blocks(osb->sb, bit_off), +					num); +	} +  	ocfs2_commit_trans(osb, handle);  out_unlock: @@ -7126,7 +7164,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,  	if (end > i_size_read(inode))  		end = i_size_read(inode); -	BUG_ON(start >= end); +	BUG_ON(start > end);  	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||  	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || @@ -7176,6 +7214,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,  	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);  	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); +	ocfs2_update_inode_fsync_trans(handle, inode, 1);  	ocfs2_journal_dirty(handle, di_bh);  out_commit: @@ -7260,14 +7299,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)  	start = range->start >> osb->s_clustersize_bits;  	len = range->len >> osb->s_clustersize_bits;  	minlen = range->minlen >> osb->s_clustersize_bits; -	trimmed = 0; -	if (!len) { -		range->len = 0; -		return 0; -	} - -	if (minlen >= osb->bitmap_cpg) +	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)  		return -EINVAL;  	main_bm_inode = ocfs2_get_system_file_inode(osb, @@ -7293,6 +7326,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)  		goto out_unlock;  	} +	len = range->len >> osb->s_clustersize_bits;  	if (start + len > le32_to_cpu(main_bm->i_clusters))  		len = le32_to_cpu(main_bm->i_clusters) - start; @@ -7307,6 +7341,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)  	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);  	last_bit = osb->bitmap_cpg; +	trimmed = 0;  	for (group = first_group; group <= last_group;) {  		if (first_bit + len >= osb->bitmap_cpg)  			last_bit = osb->bitmap_cpg; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f37d3c0e205..4a231a166cf 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -80,6 +80,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,  	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,  						    le32_to_cpu(fe->i_clusters))) { +		err = -ENOMEM;  		mlog(ML_ERROR, "block offset is outside the allocated size: "  		     "%llu\n", (unsigned long long)iblock);  		goto bail; @@ -92,6 +93,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,  			    iblock;  		buffer_cache_bh = sb_getblk(osb->sb, blkno);  		if (!buffer_cache_bh) { +			err = -ENOMEM;  			mlog(ML_ERROR, "couldn't getblock for symlink!\n");  			goto bail;  		} @@ -569,7 +571,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,  {  	struct inode *inode = file_inode(iocb->ki_filp);  	int level; -	wait_queue_head_t *wq = ocfs2_ioend_wq(inode);  	/* this io's submitter should not have unlocked this before we could */  	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); @@ -580,10 +581,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,  	if (ocfs2_iocb_is_unaligned_aio(iocb)) {  		ocfs2_iocb_clear_unaligned_aio(iocb); -		if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) && -		    waitqueue_active(wq)) { -			wake_up_all(wq); -		} +		mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);  	}  	ocfs2_iocb_clear_rw_locked(iocb); @@ -592,33 +590,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,  	ocfs2_rw_unlock(inode, level);  } -/* - * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen - * from ext3.  PageChecked() bits have been removed as OCFS2 does not - * do journalled data. - */ -static void ocfs2_invalidatepage(struct page *page, unsigned int offset, -				 unsigned int length) -{ -	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - -	jbd2_journal_invalidatepage(journal, page, offset, length); -} -  static int ocfs2_releasepage(struct page *page, gfp_t wait)  { -	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; -  	if (!page_has_buffers(page))  		return 0; -	return jbd2_journal_try_to_free_buffers(journal, page, wait); +	return try_to_free_buffers(page);  }  static ssize_t ocfs2_direct_IO(int rw,  			       struct kiocb *iocb, -			       const struct iovec *iov, -			       loff_t offset, -			       unsigned long nr_segs) +			       struct iov_iter *iter, +			       loff_t offset)  {  	struct file *file = iocb->ki_filp;  	struct inode *inode = file_inode(file)->i_mapping->host; @@ -635,7 +617,7 @@ static ssize_t ocfs2_direct_IO(int rw,  		return 0;  	return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, -				    iov, offset, nr_segs, +				    iter, offset,  				    ocfs2_direct_IO_get_blocks,  				    ocfs2_dio_end_io, NULL, 0);  } @@ -1802,8 +1784,7 @@ try_again:  			data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;  		credits = ocfs2_calc_extend_credits(inode->i_sb, -						    &di->id2.i_list, -						    clusters_to_alloc); +						    &di->id2.i_list);  	} @@ -1897,10 +1878,14 @@ out_commit:  out:  	ocfs2_free_write_ctxt(wc); -	if (data_ac) +	if (data_ac) {  		ocfs2_free_alloc_context(data_ac); -	if (meta_ac) +		data_ac = NULL; +	} +	if (meta_ac) {  		ocfs2_free_alloc_context(meta_ac); +		meta_ac = NULL; +	}  	if (ret == -ENOSPC && try_free) {  		/* @@ -2053,6 +2038,7 @@ out_write_size:  	inode->i_mtime = inode->i_ctime = CURRENT_TIME;  	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);  	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); +	ocfs2_update_inode_fsync_trans(handle, inode, 1);  	ocfs2_journal_dirty(handle, wc->w_di_bh);  	ocfs2_commit_trans(osb, handle); @@ -2087,7 +2073,7 @@ const struct address_space_operations ocfs2_aops = {  	.write_end		= ocfs2_write_end,  	.bmap			= ocfs2_bmap,  	.direct_IO		= ocfs2_direct_IO, -	.invalidatepage		= ocfs2_invalidatepage, +	.invalidatepage		= block_invalidatepage,  	.releasepage		= ocfs2_releasepage,  	.migratepage		= buffer_migrate_page,  	.is_partially_uptodate	= block_is_partially_uptodate, diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index f671e49beb3..6cae155d54d 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -102,9 +102,4 @@ enum ocfs2_iocb_lock_bits {  #define ocfs2_iocb_is_unaligned_aio(iocb) \  	test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) -#define OCFS2_IOEND_WQ_HASH_SZ	37 -#define ocfs2_ioend_wq(v)   (&ocfs2__ioend_wq[((unsigned long)(v)) %\ -					    OCFS2_IOEND_WQ_HASH_SZ]) -extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; -  #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 5d18ad10c27..1edcb141f63 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -90,7 +90,6 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,  		 * information for this bh as it's not marked locally  		 * uptodate. */  		ret = -EIO; -		put_bh(bh);  		mlog_errno(ret);  	} @@ -115,7 +114,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,  		if (bhs[i] == NULL) {  			bhs[i] = sb_getblk(osb->sb, block++);  			if (bhs[i] == NULL) { -				status = -EIO; +				status = -ENOMEM;  				mlog_errno(status);  				goto bail;  			} @@ -214,7 +213,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,  			bhs[i] = sb_getblk(sb, block++);  			if (bhs[i] == NULL) {  				ocfs2_metadata_cache_io_unlock(ci); -				status = -EIO; +				status = -ENOMEM;  				mlog_errno(status);  				goto bail;  			} @@ -420,7 +419,6 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,  	if (!buffer_uptodate(bh)) {  		ret = -EIO; -		put_bh(bh);  		mlog_errno(ret);  	} diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile index bc8c5e7d860..1aefc0350ec 100644 --- a/fs/ocfs2/cluster/Makefile +++ b/fs/ocfs2/cluster/Makefile @@ -1,4 +1,4 @@  obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o  ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ -	quorum.o tcp.o netdebug.o ver.o +	quorum.o tcp.o netdebug.o diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 363f0dcc924..73039295d0d 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -35,6 +35,7 @@  #include <linux/time.h>  #include <linux/debugfs.h>  #include <linux/slab.h> +#include <linux/bitmap.h>  #include "heartbeat.h"  #include "tcp.h" @@ -282,15 +283,6 @@ struct o2hb_bio_wait_ctxt {  	int               wc_error;  }; -static int o2hb_pop_count(void *map, int count) -{ -	int i = -1, pop = 0; - -	while ((i = find_next_bit(map, count, i + 1)) < count) -		pop++; -	return pop; -} -  static void o2hb_write_timeout(struct work_struct *work)  {  	int failed, quorum; @@ -307,9 +299,9 @@ static void o2hb_write_timeout(struct work_struct *work)  		spin_lock_irqsave(&o2hb_live_lock, flags);  		if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))  			set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); -		failed = o2hb_pop_count(&o2hb_failed_region_bitmap, +		failed = bitmap_weight(o2hb_failed_region_bitmap,  					O2NM_MAX_REGIONS); -		quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap, +		quorum = bitmap_weight(o2hb_quorum_region_bitmap,  					O2NM_MAX_REGIONS);  		spin_unlock_irqrestore(&o2hb_live_lock, flags); @@ -421,7 +413,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,  	}  	/* Must put everything in 512 byte sectors for the bio... */ -	bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9); +	bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);  	bio->bi_bdev = reg->hr_bdev;  	bio->bi_private = wc;  	bio->bi_end_io = o2hb_bio_end_io; @@ -765,7 +757,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)  	 * If global heartbeat active, unpin all regions if the  	 * region count > CUT_OFF  	 */ -	if (o2hb_pop_count(&o2hb_quorum_region_bitmap, +	if (bitmap_weight(o2hb_quorum_region_bitmap,  			   O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)  		o2hb_region_unpin(NULL);  unlock: @@ -954,23 +946,9 @@ out:  	return changed;  } -/* This could be faster if we just implmented a find_last_bit, but I - * don't think the circumstances warrant it. */ -static int o2hb_highest_node(unsigned long *nodes, -			     int numbits) +static int o2hb_highest_node(unsigned long *nodes, int numbits)  { -	int highest, node; - -	highest = numbits; -	node = -1; -	while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) { -		if (node >= numbits) -			break; - -		highest = node; -	} - -	return highest; +	return find_last_bit(nodes, numbits);  }  static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) @@ -1129,7 +1107,7 @@ static int o2hb_thread(void *data)  	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); -	set_user_nice(current, -20); +	set_user_nice(current, MIN_NICE);  	/* Pin node */  	o2nm_depend_this_node(); @@ -1829,7 +1807,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,  	live_threshold = O2HB_LIVE_THRESHOLD;  	if (o2hb_global_heartbeat_active()) {  		spin_lock(&o2hb_live_lock); -		if (o2hb_pop_count(&o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1) +		if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)  			live_threshold <<= 1;  		spin_unlock(&o2hb_live_lock);  	} @@ -2180,7 +2158,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,  	if (!o2hb_dependent_users)  		goto unlock; -	if (o2hb_pop_count(&o2hb_quorum_region_bitmap, +	if (bitmap_weight(o2hb_quorum_region_bitmap,  			   O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)  		o2hb_region_pin(NULL); @@ -2480,7 +2458,7 @@ static int o2hb_region_inc_user(const char *region_uuid)  	if (o2hb_dependent_users > 1)  		goto unlock; -	if (o2hb_pop_count(&o2hb_quorum_region_bitmap, +	if (bitmap_weight(o2hb_quorum_region_bitmap,  			   O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)  		ret = o2hb_region_pin(NULL); diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index baa2b9ef7ee..2260fb9e650 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -199,7 +199,8 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;  #define mlog_errno(st) do {						\  	int _st = (st);							\  	if (_st != -ERESTARTSYS && _st != -EINTR &&			\ -	    _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC)		\ +	    _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC &&		\ +	    _st != -EDQUOT)						\  		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\  } while (0) diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index bb240647ca5..441c84e169e 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -29,7 +29,6 @@  #include "heartbeat.h"  #include "masklog.h"  #include "sys.h" -#include "ver.h"  /* for now we operate under the assertion that there can be only one   * cluster active at a time.  Changing this will require trickling @@ -945,8 +944,6 @@ static int __init init_o2nm(void)  {  	int ret = -1; -	cluster_print_version(); -  	ret = o2hb_init();  	if (ret)  		goto out; @@ -984,6 +981,7 @@ out:  MODULE_AUTHOR("Oracle");  MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster management");  module_init(init_o2nm)  module_exit(exit_o2nm) diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index a4b07730b2e..b7f57271d49 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -41,7 +41,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,  	return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);  }  static struct kobj_attribute attr_version = -	__ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL); +	__ATTR(interface_revision, S_IRUGO, version_show, NULL);  static struct attribute *o2cb_attrs[] = {  	&attr_version.attr, diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2cd2406b414..681691bc233 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -108,7 +108,7 @@ static struct rb_root o2net_handler_tree = RB_ROOT;  static struct o2net_node o2net_nodes[O2NM_MAX_NODES];  /* XXX someday we'll need better accounting */ -static struct socket *o2net_listen_sock = NULL; +static struct socket *o2net_listen_sock;  /*   * listen work is only queued by the listening socket callbacks on the @@ -137,7 +137,7 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] =  static void o2net_sc_connect_completed(struct work_struct *work);  static void o2net_rx_until_empty(struct work_struct *work);  static void o2net_shutdown_sc(struct work_struct *work); -static void o2net_listen_data_ready(struct sock *sk, int bytes); +static void o2net_listen_data_ready(struct sock *sk);  static void o2net_sc_send_keep_req(struct work_struct *work);  static void o2net_idle_timer(unsigned long data);  static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); @@ -262,17 +262,17 @@ static void o2net_update_recv_stats(struct o2net_sock_container *sc)  #endif /* CONFIG_OCFS2_FS_STATS */ -static inline int o2net_reconnect_delay(void) +static inline unsigned int o2net_reconnect_delay(void)  {  	return o2nm_single_cluster->cl_reconnect_delay_ms;  } -static inline int o2net_keepalive_delay(void) +static inline unsigned int o2net_keepalive_delay(void)  {  	return o2nm_single_cluster->cl_keepalive_delay_ms;  } -static inline int o2net_idle_timeout(void) +static inline unsigned int o2net_idle_timeout(void)  {  	return o2nm_single_cluster->cl_idle_timeout_ms;  } @@ -597,9 +597,9 @@ static void o2net_set_nn_state(struct o2net_node *nn,  }  /* see o2net_register_callbacks() */ -static void o2net_data_ready(struct sock *sk, int bytes) +static void o2net_data_ready(struct sock *sk)  { -	void (*ready)(struct sock *sk, int bytes); +	void (*ready)(struct sock *sk);  	read_lock(&sk->sk_callback_lock);  	if (sk->sk_user_data) { @@ -613,7 +613,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)  	}  	read_unlock(&sk->sk_callback_lock); -	ready(sk, bytes); +	ready(sk);  }  /* see o2net_register_callbacks() */ @@ -916,57 +916,30 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)  static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)  { -	int ret; -	mm_segment_t oldfs; -	struct kvec vec = { -		.iov_len = len, -		.iov_base = data, -	}; -	struct msghdr msg = { -		.msg_iovlen = 1, -		.msg_iov = (struct iovec *)&vec, -       		.msg_flags = MSG_DONTWAIT, -	}; - -	oldfs = get_fs(); -	set_fs(get_ds()); -	ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); -	set_fs(oldfs); - -	return ret; +	struct kvec vec = { .iov_len = len, .iov_base = data, }; +	struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; +	return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags);  }  static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,  			      size_t veclen, size_t total)  {  	int ret; -	mm_segment_t oldfs; -	struct msghdr msg = { -		.msg_iov = (struct iovec *)vec, -		.msg_iovlen = veclen, -	}; +	struct msghdr msg;  	if (sock == NULL) {  		ret = -EINVAL;  		goto out;  	} -	oldfs = get_fs(); -	set_fs(get_ds()); -	ret = sock_sendmsg(sock, &msg, total); -	set_fs(oldfs); -	if (ret != total) { -		mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, -		     total); -		if (ret >= 0) -			ret = -EPIPE; /* should be smarter, I bet */ -		goto out; -	} - -	ret = 0; +	ret = kernel_sendmsg(sock, &msg, vec, veclen, total); +	if (likely(ret == total)) +		return 0; +	mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total); +	if (ret >= 0) +		ret = -EPIPE; /* should be smarter, I bet */  out: -	if (ret < 0) -		mlog(0, "returning error: %d\n", ret); +	mlog(0, "returning error: %d\n", ret);  	return ret;  } @@ -1826,7 +1799,7 @@ int o2net_register_hb_callbacks(void)  /* ------------------------------------------------------------ */ -static int o2net_accept_one(struct socket *sock) +static int o2net_accept_one(struct socket *sock, int *more)  {  	int ret, slen;  	struct sockaddr_in sin; @@ -1837,6 +1810,7 @@ static int o2net_accept_one(struct socket *sock)  	struct o2net_node *nn;  	BUG_ON(sock == NULL); +	*more = 0;  	ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,  			       sock->sk->sk_protocol, &new_sock);  	if (ret) @@ -1848,6 +1822,7 @@ static int o2net_accept_one(struct socket *sock)  	if (ret < 0)  		goto out; +	*more = 1;  	new_sock->sk->sk_allocation = GFP_ATOMIC;  	ret = o2net_set_nodelay(new_sock); @@ -1946,16 +1921,41 @@ out:  	return ret;  } +/* + * This function is invoked in response to one or more + * pending accepts at softIRQ level. We must drain the + * entire que before returning. + */ +  static void o2net_accept_many(struct work_struct *work)  {  	struct socket *sock = o2net_listen_sock; -	while (o2net_accept_one(sock) == 0) +	int	more; +	int	err; + +	/* +	 * It is critical to note that due to interrupt moderation +	 * at the network driver level, we can't assume to get a +	 * softIRQ for every single conn since tcp SYN packets +	 * can arrive back-to-back, and therefore many pending +	 * accepts may result in just 1 softIRQ. If we terminate +	 * the o2net_accept_one() loop upon seeing an err, what happens +	 * to the rest of the conns in the queue? If no new SYN +	 * arrives for hours, no softIRQ  will be delivered, +	 * and the connections will just sit in the queue. +	 */ + +	for (;;) { +		err = o2net_accept_one(sock, &more); +		if (!more) +			break;  		cond_resched(); +	}  } -static void o2net_listen_data_ready(struct sock *sk, int bytes) +static void o2net_listen_data_ready(struct sock *sk)  { -	void (*ready)(struct sock *sk, int bytes); +	void (*ready)(struct sock *sk);  	read_lock(&sk->sk_callback_lock);  	ready = sk->sk_user_data; @@ -1964,18 +1964,29 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes)  		goto out;  	} -	/* ->sk_data_ready is also called for a newly established child socket -	 * before it has been accepted and the acceptor has set up their -	 * data_ready.. we only want to queue listen work for our listening -	 * socket */ +	/* This callback may called twice when a new connection +	 * is  being established as a child socket inherits everything +	 * from a parent LISTEN socket, including the data_ready cb of +	 * the parent. This leads to a hazard. In o2net_accept_one() +	 * we are still initializing the child socket but have not +	 * changed the inherited data_ready callback yet when +	 * data starts arriving. +	 * We avoid this hazard by checking the state. +	 * For the listening socket,  the state will be TCP_LISTEN; for the new +	 * socket, will be  TCP_ESTABLISHED. Also, in this case, +	 * sk->sk_user_data is not a valid function pointer. +	 */ +  	if (sk->sk_state == TCP_LISTEN) { -		mlog(ML_TCP, "bytes: %d\n", bytes);  		queue_work(o2net_wq, &o2net_listen_work); +	} else { +		ready = NULL;  	}  out:  	read_unlock(&sk->sk_callback_lock); -	ready(sk, bytes); +	if (ready != NULL) +		ready(sk);  }  static int o2net_open_listening_sock(__be32 addr, __be16 port) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 4cbcb65784a..dc024367110 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -165,7 +165,7 @@ struct o2net_sock_container {  	/* original handlers for the sockets */  	void			(*sc_state_change)(struct sock *sk); -	void			(*sc_data_ready)(struct sock *sk, int bytes); +	void			(*sc_data_ready)(struct sock *sk);  	u32			sc_msg_key;  	u16			sc_msg_type; diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c deleted file mode 100644 index a56eee6abad..00000000000 --- a/fs/ocfs2/cluster/ver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle.  All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "ver.h" - -#define CLUSTER_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION - -void cluster_print_version(void) -{ -	printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(CLUSTER_BUILD_VERSION); diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h deleted file mode 100644 index 32554c3382c..00000000000 --- a/fs/ocfs2/cluster/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle.  All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef O2CLUSTER_VER_H -#define O2CLUSTER_VER_H - -void cluster_print_version(void); - -#endif /* O2CLUSTER_VER_H */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 0d3a97d2d5f..e2e05a106be 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -37,7 +37,6 @@  #include "dlmglue.h"  #include "file.h"  #include "inode.h" -#include "super.h"  #include "ocfs2_trace.h"  void ocfs2_dentry_attach_gen(struct dentry *dentry) @@ -346,52 +345,6 @@ out_attach:  	return ret;  } -DEFINE_SPINLOCK(dentry_list_lock); - -/* We limit the number of dentry locks to drop in one go. We have - * this limit so that we don't starve other users of ocfs2_wq. */ -#define DL_INODE_DROP_COUNT 64 - -/* Drop inode references from dentry locks */ -static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) -{ -	struct ocfs2_dentry_lock *dl; - -	spin_lock(&dentry_list_lock); -	while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { -		dl = osb->dentry_lock_list; -		osb->dentry_lock_list = dl->dl_next; -		spin_unlock(&dentry_list_lock); -		iput(dl->dl_inode); -		kfree(dl); -		spin_lock(&dentry_list_lock); -	} -	spin_unlock(&dentry_list_lock); -} - -void ocfs2_drop_dl_inodes(struct work_struct *work) -{ -	struct ocfs2_super *osb = container_of(work, struct ocfs2_super, -					       dentry_lock_work); - -	__ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); -	/* -	 * Don't queue dropping if umount is in progress. We flush the -	 * list in ocfs2_dismount_volume -	 */ -	spin_lock(&dentry_list_lock); -	if (osb->dentry_lock_list && -	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) -		queue_work(ocfs2_wq, &osb->dentry_lock_work); -	spin_unlock(&dentry_list_lock); -} - -/* Flush the whole work queue */ -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) -{ -	__ocfs2_drop_dl_inodes(osb, -1); -} -  /*   * ocfs2_dentry_iput() and friends.   * @@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)  static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,  				   struct ocfs2_dentry_lock *dl)  { +	iput(dl->dl_inode);  	ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);  	ocfs2_lock_res_free(&dl->dl_lockres); - -	/* We leave dropping of inode reference to ocfs2_wq as that can -	 * possibly lead to inode deletion which gets tricky */ -	spin_lock(&dentry_list_lock); -	if (!osb->dentry_lock_list && -	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) -		queue_work(ocfs2_wq, &osb->dentry_lock_work); -	dl->dl_next = osb->dentry_lock_list; -	osb->dentry_lock_list = dl; -	spin_unlock(&dentry_list_lock); +	kfree(dl);  }  void ocfs2_dentry_lock_put(struct ocfs2_super *osb,  			   struct ocfs2_dentry_lock *dl)  { -	int unlock; +	int unlock = 0;  	BUG_ON(dl->dl_count == 0); diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index b79eff70995..55f58892b15 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -29,13 +29,8 @@  extern const struct dentry_operations ocfs2_dentry_ops;  struct ocfs2_dentry_lock { -	/* Use count of dentry lock */  	unsigned int		dl_count; -	union { -		/* Linked list of dentry locks to release */ -		struct ocfs2_dentry_lock *dl_next; -		u64			dl_parent_blkno; -	}; +	u64			dl_parent_blkno;  	/*  	 * The ocfs2_dentry_lock keeps an inode reference until @@ -49,14 +44,9 @@ struct ocfs2_dentry_lock {  int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,  			     u64 parent_blkno); -extern spinlock_t dentry_list_lock; -  void ocfs2_dentry_lock_put(struct ocfs2_super *osb,  			   struct ocfs2_dentry_lock *dl); -void ocfs2_drop_dl_inodes(struct work_struct *work); -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); -  struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,  				      int skip_unhashed); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 30544ce8e9f..0717662b4ae 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2349,7 +2349,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,  	dx_root_bh = sb_getblk(osb->sb, dr_blkno);  	if (dx_root_bh == NULL) { -		ret = -EIO; +		ret = -ENOMEM;  		goto out;  	}  	ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh); @@ -2422,7 +2422,7 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,  	for (i = 0; i < num_dx_leaves; i++) {  		bh = sb_getblk(osb->sb, start_blk + i);  		if (bh == NULL) { -			ret = -EIO; +			ret = -ENOMEM;  			goto out;  		}  		dx_leaves[i] = bh; @@ -2929,7 +2929,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,  	blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);  	dirdata_bh = sb_getblk(sb, blkno);  	if (!dirdata_bh) { -		ret = -EIO; +		ret = -ENOMEM;  		mlog_errno(ret);  		goto out_commit;  	} @@ -2957,6 +2957,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,  		ocfs2_init_dir_trailer(dir, dirdata_bh, i);  	} +	ocfs2_update_inode_fsync_trans(handle, dir, 1);  	ocfs2_journal_dirty(handle, dirdata_bh);  	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { @@ -3005,6 +3006,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,  	di->i_size = cpu_to_le64(sb->s_blocksize);  	di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);  	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); +	ocfs2_update_inode_fsync_trans(handle, dir, 1);  	/*  	 * This should never fail as our extent list is empty and all @@ -3159,7 +3161,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,  	*new_bh = sb_getblk(sb, p_blkno);  	if (!*new_bh) { -		status = -EIO; +		status = -ENOMEM;  		mlog_errno(status);  		goto bail;  	} @@ -3284,7 +3286,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,  		if (ocfs2_dir_resv_allowed(osb))  			data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv; -		credits = ocfs2_calc_extend_credits(sb, el, 1); +		credits = ocfs2_calc_extend_credits(sb, el);  	} else {  		spin_unlock(&OCFS2_I(dir)->ip_lock);  		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; @@ -3338,6 +3340,7 @@ do_extend:  	} else {  		de->rec_len = cpu_to_le16(sb->s_blocksize);  	} +	ocfs2_update_inode_fsync_trans(handle, dir, 1);  	ocfs2_journal_dirty(handle, new_bh);  	dir_i_size += dir->i_sb->s_blocksize; @@ -3716,7 +3719,7 @@ static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,  {  	int credits = ocfs2_clusters_to_blocks(osb->sb, 2); -	credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1); +	credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list);  	credits += ocfs2_quota_trans_credits(osb->sb);  	return credits;  } @@ -3896,6 +3899,7 @@ out_commit:  		dquot_free_space_nodirty(dir,  				ocfs2_clusters_to_bytes(dir->i_sb, 1)); +	ocfs2_update_inode_fsync_trans(handle, dir, 1);  	ocfs2_commit_trans(osb, handle);  out: @@ -4134,6 +4138,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,  		mlog_errno(ret);  	did_quota = 0; +	ocfs2_update_inode_fsync_trans(handle, dir, 1);  	ocfs2_journal_dirty(handle, dx_root_bh);  out_commit: @@ -4401,6 +4406,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,  	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);  	spin_unlock(&OCFS2_I(dir)->ip_lock);  	di->i_dx_root = cpu_to_le64(0ULL); +	ocfs2_update_inode_fsync_trans(handle, dir, 1);  	ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index c8a044efbb1..bd1aab1f49a 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2  obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o  ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ -	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o +	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e0517762fcc..fae17c640df 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -108,7 +108,6 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)  struct dlm_recovery_ctxt  {  	struct list_head resources; -	struct list_head received;  	struct list_head node_data;  	u8  new_master;  	u8  dead_node; @@ -332,6 +331,7 @@ struct dlm_lock_resource  	u16 state;  	char lvb[DLM_LVB_LEN];  	unsigned int inflight_locks; +	unsigned int inflight_assert_workers;  	unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];  }; @@ -911,6 +911,9 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,  void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,  				   struct dlm_lock_resource *res); +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, +		struct dlm_lock_resource *res); +  void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);  void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);  void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index e33cd7a3c58..18f13c2e4a1 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -338,7 +338,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)  #ifdef CONFIG_DEBUG_FS -static struct dentry *dlm_debugfs_root = NULL; +static struct dentry *dlm_debugfs_root;  #define DLM_DEBUGFS_DIR				"o2dlm"  #define DLM_DEBUGFS_DLM_STATE			"dlm_state" diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8b3382abf84..39efc5057a3 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -43,8 +43,6 @@  #include "dlmdomain.h"  #include "dlmdebug.h" -#include "dlmver.h" -  #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)  #include "cluster/masklog.h" @@ -961,6 +959,14 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,  		 * domain. Set him in the map and clean up our  		 * leftover join state. */  		BUG_ON(dlm->joining_node != assert->node_idx); + +		if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { +			mlog(0, "dlm recovery is ongoing, disallow join\n"); +			spin_unlock(&dlm->spinlock); +			spin_unlock(&dlm_domain_lock); +			return -EAGAIN; +		} +  		set_bit(assert->node_idx, dlm->domain_map);  		clear_bit(assert->node_idx, dlm->exit_domain_map);  		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); @@ -1125,7 +1131,6 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,  	struct dlm_ctxt *dlm = NULL;  	char *local = NULL;  	int status = 0; -	int locked = 0;  	qr = (struct dlm_query_region *) msg->buf; @@ -1134,10 +1139,8 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,  	/* buffer used in dlm_mast_regions() */  	local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); -	if (!local) { -		status = -ENOMEM; -		goto bail; -	} +	if (!local) +		return -ENOMEM;  	status = -EINVAL; @@ -1146,16 +1149,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,  	if (!dlm) {  		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "  		     "before join domain\n", qr->qr_node, qr->qr_domain); -		goto bail; +		goto out_domain_lock;  	}  	spin_lock(&dlm->spinlock); -	locked = 1;  	if (dlm->joining_node != qr->qr_node) {  		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "  		     "but joining node is %d\n", qr->qr_node, qr->qr_domain,  		     dlm->joining_node); -		goto bail; +		goto out_dlm_lock;  	}  	/* Support for global heartbeat was added in 1.1 */ @@ -1165,14 +1167,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,  		     "but active dlm protocol is %d.%d\n", qr->qr_node,  		     qr->qr_domain, dlm->dlm_locking_proto.pv_major,  		     dlm->dlm_locking_proto.pv_minor); -		goto bail; +		goto out_dlm_lock;  	}  	status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions)); -bail: -	if (locked) -		spin_unlock(&dlm->spinlock); +out_dlm_lock: +	spin_unlock(&dlm->spinlock); + +out_domain_lock:  	spin_unlock(&dlm_domain_lock);  	kfree(local); @@ -1522,6 +1525,7 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,  				    unsigned int node)  {  	int status; +	int ret;  	struct dlm_assert_joined assert_msg;  	mlog(0, "Sending join assert to node %u\n", node); @@ -1533,11 +1537,13 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,  	status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,  				    &assert_msg, sizeof(assert_msg), node, -				    NULL); +				    &ret);  	if (status < 0)  		mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "  		     "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,  		     node); +	else +		status = ret;  	return status;  } @@ -1879,19 +1885,19 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)  		goto bail;  	} -	status = dlm_debug_init(dlm); +	status = dlm_launch_thread(dlm);  	if (status < 0) {  		mlog_errno(status);  		goto bail;  	} -	status = dlm_launch_thread(dlm); +	status = dlm_launch_recovery_thread(dlm);  	if (status < 0) {  		mlog_errno(status);  		goto bail;  	} -	status = dlm_launch_recovery_thread(dlm); +	status = dlm_debug_init(dlm);  	if (status < 0) {  		mlog_errno(status);  		goto bail; @@ -2028,7 +2034,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,  	INIT_LIST_HEAD(&dlm->list);  	INIT_LIST_HEAD(&dlm->dirty_list);  	INIT_LIST_HEAD(&dlm->reco.resources); -	INIT_LIST_HEAD(&dlm->reco.received);  	INIT_LIST_HEAD(&dlm->reco.node_data);  	INIT_LIST_HEAD(&dlm->purge_list);  	INIT_LIST_HEAD(&dlm->dlm_domain_handlers); @@ -2328,8 +2333,6 @@ static int __init dlm_init(void)  {  	int status; -	dlm_print_version(); -  	status = dlm_init_mle_cache();  	if (status) {  		mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); @@ -2379,6 +2382,7 @@ static void __exit dlm_exit (void)  MODULE_AUTHOR("Oracle");  MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 Distributed Lock Management");  module_init(dlm_init);  module_exit(dlm_exit); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 5d32f7511f7..66c2a491f68 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -52,7 +52,7 @@  #define MLOG_MASK_PREFIX ML_DLM  #include "cluster/masklog.h" -static struct kmem_cache *dlm_lock_cache = NULL; +static struct kmem_cache *dlm_lock_cache;  static DEFINE_SPINLOCK(dlm_cookie_lock);  static u64 dlm_next_cookie = 1; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index cf0f103963b..82abf0cc9a1 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -82,9 +82,9 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,  	return 1;  } -static struct kmem_cache *dlm_lockres_cache = NULL; -static struct kmem_cache *dlm_lockname_cache = NULL; -static struct kmem_cache *dlm_mle_cache = NULL; +static struct kmem_cache *dlm_lockres_cache; +static struct kmem_cache *dlm_lockname_cache; +static struct kmem_cache *dlm_mle_cache;  static void dlm_mle_release(struct kref *kref);  static void dlm_init_mle(struct dlm_master_list_entry *mle, @@ -472,11 +472,15 @@ bail:  void dlm_destroy_master_caches(void)  { -	if (dlm_lockname_cache) +	if (dlm_lockname_cache) {  		kmem_cache_destroy(dlm_lockname_cache); +		dlm_lockname_cache = NULL; +	} -	if (dlm_lockres_cache) +	if (dlm_lockres_cache) {  		kmem_cache_destroy(dlm_lockres_cache); +		dlm_lockres_cache = NULL; +	}  }  static void dlm_lockres_release(struct kref *kref) @@ -577,6 +581,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,  	atomic_set(&res->asts_reserved, 0);  	res->migration_pending = 0;  	res->inflight_locks = 0; +	res->inflight_assert_workers = 0;  	res->dlm = dlm; @@ -679,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,  	wake_up(&res->wq);  } +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, +		struct dlm_lock_resource *res) +{ +	assert_spin_locked(&res->spinlock); +	res->inflight_assert_workers++; +	mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", +			dlm->name, res->lockname.len, res->lockname.name, +			res->inflight_assert_workers); +} + +static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, +		struct dlm_lock_resource *res) +{ +	spin_lock(&res->spinlock); +	__dlm_lockres_grab_inflight_worker(dlm, res); +	spin_unlock(&res->spinlock); +} + +static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, +		struct dlm_lock_resource *res) +{ +	assert_spin_locked(&res->spinlock); +	BUG_ON(res->inflight_assert_workers == 0); +	res->inflight_assert_workers--; +	mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", +			dlm->name, res->lockname.len, res->lockname.name, +			res->inflight_assert_workers); +} + +static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, +		struct dlm_lock_resource *res) +{ +	spin_lock(&res->spinlock); +	__dlm_lockres_drop_inflight_worker(dlm, res); +	spin_unlock(&res->spinlock); +} +  /*   * lookup a lock resource by name.   * may already exist in the hashtable. @@ -1599,7 +1641,8 @@ send_response:  			mlog(ML_ERROR, "failed to dispatch assert master work\n");  			response = DLM_MASTER_RESP_ERROR;  			dlm_lockres_put(res); -		} +		} else +			dlm_lockres_grab_inflight_worker(dlm, res);  	} else {  		if (res)  			dlm_lockres_put(res); @@ -1885,8 +1928,10 @@ ok:  			 * up nodes that this node contacted */  			while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,  						    nn+1)) < O2NM_MAX_NODES) { -				if (nn != dlm->node_num && nn != assert->node_idx) +				if (nn != dlm->node_num && nn != assert->node_idx) {  					master_request = 1; +					break; +				}  			}  		}  		mle->master = assert->node_idx; @@ -2112,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)  	dlm_lockres_release_ast(dlm, res);  put: +	dlm_lockres_drop_inflight_worker(dlm, res); +  	dlm_lockres_put(res);  	mlog(0, "finished with dlm_assert_master_worker\n"); @@ -2354,6 +2401,10 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,  	assert_spin_locked(&res->spinlock); +	/* delay migration when the lockres is in MIGRATING state */ +	if (res->state & DLM_LOCK_RES_MIGRATING) +		return 0; +  	if (res->owner != dlm->node_num)  		return 0; @@ -3078,11 +3129,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,  			/* remove it so that only one mle will be found */  			__dlm_unlink_mle(dlm, tmp);  			__dlm_mle_detach_hb_events(dlm, tmp); -			ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; -			mlog(0, "%s:%.*s: master=%u, newmaster=%u, " -			    "telling master to get ref for cleared out mle " -			    "during migration\n", dlm->name, namelen, name, -			    master, new_master); +			if (tmp->type == DLM_MLE_MASTER) { +				ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; +				mlog(0, "%s:%.*s: master=%u, newmaster=%u, " +						"telling master to get ref " +						"for cleared out mle during " +						"migration\n", dlm->name, +						namelen, name, master, +						new_master); +			}  		}  		spin_unlock(&tmp->spinlock);  	} diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 0b5adca1b17..45067faf569 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -537,7 +537,10 @@ master_here:  		/* success!  see if any other nodes need recovery */  		mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",  		     dlm->name, dlm->reco.dead_node, dlm->node_num); -		dlm_reset_recovery(dlm); +		spin_lock(&dlm->spinlock); +		__dlm_reset_recovery(dlm); +		dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; +		spin_unlock(&dlm->spinlock);  	}  	dlm_end_recovery(dlm); @@ -695,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)  		if (all_nodes_done) {  			int ret; +			/* Set this flag on recovery master to avoid +			 * a new recovery for another dead node start +			 * before the recovery is not done. That may +			 * cause recovery hung.*/ +			spin_lock(&dlm->spinlock); +			dlm->reco.state |= DLM_RECO_STATE_FINALIZE; +			spin_unlock(&dlm->spinlock); +  			/* all nodes are now in DLM_RECO_NODE_DATA_DONE state  	 		 * just send a finalize message to everyone and  	 		 * clean up */ @@ -1697,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,  				mlog_errno(-ENOMEM);  				/* retry!? */  				BUG(); -			} +			} else +				__dlm_lockres_grab_inflight_worker(dlm, res);  		} else /* put.. incase we are not the master */  			dlm_lockres_put(res);  		spin_unlock(&res->spinlock); @@ -1750,13 +1762,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,  				     struct dlm_migratable_lockres *mres)  {  	struct dlm_migratable_lock *ml; -	struct list_head *queue; +	struct list_head *queue, *iter;  	struct list_head *tmpq = NULL;  	struct dlm_lock *newlock = NULL;  	struct dlm_lockstatus *lksb = NULL;  	int ret = 0;  	int i, j, bad; -	struct dlm_lock *lock = NULL; +	struct dlm_lock *lock;  	u8 from = O2NM_MAX_NODES;  	unsigned int added = 0;  	__be64 c; @@ -1791,14 +1803,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,  			/* MIGRATION ONLY! */  			BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); +			lock = NULL;  			spin_lock(&res->spinlock);  			for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {  				tmpq = dlm_list_idx_to_ptr(res, j); -				list_for_each_entry(lock, tmpq, list) { -					if (lock->ml.cookie != ml->cookie) -						lock = NULL; -					else +				list_for_each(iter, tmpq) { +					lock = list_entry(iter, +						  struct dlm_lock, list); +					if (lock->ml.cookie == ml->cookie)  						break; +					lock = NULL;  				}  				if (lock)  					break; @@ -1886,6 +1900,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,  		if (ml->type == LKM_NLMODE)  			goto skip_lvb; +		/* +		 * If the lock is in the blocked list it can't have a valid lvb, +		 * so skip it +		 */ +		if (ml->list == DLM_BLOCKED_LIST) +			goto skip_lvb; +  		if (!dlm_lvb_is_empty(mres->lvb)) {  			if (lksb->flags & DLM_LKSB_PUT_LVB) {  				/* other node was trying to update @@ -1966,7 +1987,15 @@ skip_lvb:  		}  		if (!bad) {  			dlm_lock_get(newlock); -			list_add_tail(&newlock->list, queue); +			if (mres->flags & DLM_MRES_RECOVERY && +					ml->list == DLM_CONVERTING_LIST && +					newlock->ml.type > +					newlock->ml.convert_type) { +				/* newlock is doing downconvert, add it to the +				 * head of converting list */ +				list_add(&newlock->list, queue); +			} else +				list_add_tail(&newlock->list, queue);  			mlog(0, "%s:%.*s: added lock for node %u, "  			     "setting refmap bit\n", dlm->name,  			     res->lockname.len, res->lockname.name, ml->node); @@ -2875,8 +2904,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,  				BUG();  			}  			dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; +			__dlm_reset_recovery(dlm);  			spin_unlock(&dlm->spinlock); -			dlm_reset_recovery(dlm);  			dlm_kick_recovery_thread(dlm);  			break;  		default: diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 9db869de829..69aac6f088a 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -259,12 +259,15 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,  		 * refs on it. */  		unused = __dlm_lockres_unused(lockres);  		if (!unused || -		    (lockres->state & DLM_LOCK_RES_MIGRATING)) { +		    (lockres->state & DLM_LOCK_RES_MIGRATING) || +		    (lockres->inflight_assert_workers != 0)) {  			mlog(0, "%s: res %.*s is in use or being remastered, " -			     "used %d, state %d\n", dlm->name, -			     lockres->lockname.len, lockres->lockname.name, -			     !unused, lockres->state); -			list_move_tail(&dlm->purge_list, &lockres->purge); +			     "used %d, state %d, assert master workers %u\n", +			     dlm->name, lockres->lockname.len, +			     lockres->lockname.name, +			     !unused, lockres->state, +			     lockres->inflight_assert_workers); +			list_move_tail(&lockres->purge, &dlm->purge_list);  			spin_unlock(&lockres->spinlock);  			continue;  		} diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 5698b52cf5c..2e3c9dbab68 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -191,7 +191,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,  				     DLM_UNLOCK_CLEAR_CONVERT_TYPE);  		} else if (status == DLM_RECOVERING ||  			   status == DLM_MIGRATING || -			   status == DLM_FORWARD) { +			   status == DLM_FORWARD || +			   status == DLM_NOLOCKMGR +			   ) {  			/* must clear the actions because this unlock  			 * is about to be retried.  cannot free or do  			 * any list manipulation. */ @@ -200,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,  			     res->lockname.name,  			     status==DLM_RECOVERING?"recovering":  			     (status==DLM_MIGRATING?"migrating": -			      "forward")); +				(status == DLM_FORWARD ? "forward" : +						"nolockmanager")));  			actions = 0;  		}  		if (flags & LKM_CANCEL) @@ -364,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,  			 * updated state to the recovery master.  this thread  			 * just needs to finish out the operation and call  			 * the unlockast. */ -			ret = DLM_NORMAL; +			if (dlm_is_node_dead(dlm, owner)) +				ret = DLM_NORMAL; +			else +				ret = DLM_NOLOCKMGR;  		} else {  			/* something bad.  this will BUG in ocfs2 */  			ret = dlm_err_to_dlm_status(tmpret); @@ -638,7 +644,9 @@ retry:  	if (status == DLM_RECOVERING ||  	    status == DLM_MIGRATING || -	    status == DLM_FORWARD) { +	    status == DLM_FORWARD || +	    status == DLM_NOLOCKMGR) { +  		/* We want to go away for a tiny bit to allow recovery  		 * / migration to complete on this resource. I don't  		 * know of any wait queue we could sleep on as this @@ -650,7 +658,7 @@ retry:  		msleep(50);  		mlog(0, "retrying unlock due to pending recovery/" -		     "migration/in-progress\n"); +		     "migration/in-progress/reconnect\n");  		goto retry;  	} diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c deleted file mode 100644 index dfc0da4d158..00000000000 --- a/fs/ocfs2/dlm/dlmver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle.  All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "dlmver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION - -void dlm_print_version(void) -{ -	printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h deleted file mode 100644 index f674aee77a1..00000000000 --- a/fs/ocfs2/dlm/dlmver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle.  All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLM_VER_H -#define DLM_VER_H - -void dlm_print_version(void); - -#endif /* DLM_VER_H */ diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index f14be89a670..eed3db8c5b4 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2  obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o -ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o +ocfs2_dlmfs-objs := userdlm.o dlmfs.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index efa2b3d339e..09b7d9dac71 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -49,7 +49,6 @@  #include "stackglue.h"  #include "userdlm.h" -#include "dlmfsver.h"  #define MLOG_MASK_PREFIX ML_DLMFS  #include "cluster/masklog.h" @@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void)  	int status;  	int cleanup_inode = 0, cleanup_worker = 0; -	dlmfs_print_version(); -  	status = bdi_init(&dlmfs_backing_dev_info);  	if (status)  		return status; @@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void)  MODULE_AUTHOR("Oracle");  MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");  module_init(init_dlmfs_fs)  module_exit(exit_dlmfs_fs) diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c deleted file mode 100644 index a733b3321f8..00000000000 --- a/fs/ocfs2/dlmfs/dlmfsver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle.  All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "dlmfsver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION - -void dlmfs_print_version(void) -{ -	printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h deleted file mode 100644 index f35eadbed25..00000000000 --- a/fs/ocfs2/dlmfs/dlmfsver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle.  All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLMFS_VER_H -#define DLMFS_VER_H - -void dlmfs_print_version(void); - -#endif /* DLMFS_VER_H */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3a44a648dae..52cfe99ae05 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1304,7 +1304,7 @@ static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)  {  	wait_for_completion(&mw->mw_complete);  	/* Re-arm the completion in case we want to wait on it again */ -	INIT_COMPLETION(mw->mw_complete); +	reinit_completion(&mw->mw_complete);  	return mw->mw_status;  } @@ -1355,7 +1355,7 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,  	else  		ret = mw->mw_status;  	/* Re-arm the completion in case we want to wait on it again */ -	INIT_COMPLETION(mw->mw_complete); +	reinit_completion(&mw->mw_complete);  	return ret;  } @@ -2544,11 +2544,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,  	 * refreshed, so we do it here. Of course, making sense of  	 * everything is up to the caller :) */  	status = ocfs2_should_refresh_lock_res(lockres); -	if (status < 0) { -		ocfs2_cluster_unlock(osb, lockres, level); -		mlog_errno(status); -		goto bail; -	}  	if (status) {  		status = ocfs2_refresh_slot_info(osb); @@ -2996,6 +2991,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)  	/* for now, uuid == domain */  	status = ocfs2_cluster_connect(osb->osb_cluster_stack, +				       osb->osb_cluster_name, +				       strlen(osb->osb_cluster_name),  				       osb->uuid_str,  				       strlen(osb->uuid_str),  				       &lproto, ocfs2_do_node_down, osb, @@ -3005,7 +3002,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)  		goto bail;  	} -	status = ocfs2_cluster_this_node(&osb->node_num); +	status = ocfs2_cluster_this_node(conn, &osb->node_num);  	if (status < 0) {  		mlog_errno(status);  		mlog(ML_ERROR, @@ -3142,22 +3139,60 @@ out:  	return 0;  } +static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, +				       struct ocfs2_lock_res *lockres); +  /* Mark the lockres as being dropped. It will no longer be   * queued if blocking, but we still may have to wait on it   * being dequeued from the downconvert thread before we can consider   * it safe to drop.   *   * You can *not* attempt to call cluster_lock on this lockres anymore. */ -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, +				struct ocfs2_lock_res *lockres)  {  	int status;  	struct ocfs2_mask_waiter mw; -	unsigned long flags; +	unsigned long flags, flags2;  	ocfs2_init_mask_waiter(&mw);  	spin_lock_irqsave(&lockres->l_lock, flags);  	lockres->l_flags |= OCFS2_LOCK_FREEING; +	if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) { +		/* +		 * We know the downconvert is queued but not in progress +		 * because we are the downconvert thread and processing +		 * different lock. So we can just remove the lock from the +		 * queue. This is not only an optimization but also a way +		 * to avoid the following deadlock: +		 *   ocfs2_dentry_post_unlock() +		 *     ocfs2_dentry_lock_put() +		 *       ocfs2_drop_dentry_lock() +		 *         iput() +		 *           ocfs2_evict_inode() +		 *             ocfs2_clear_inode() +		 *               ocfs2_mark_lockres_freeing() +		 *                 ... blocks waiting for OCFS2_LOCK_QUEUED +		 *                 since we are the downconvert thread which +		 *                 should clear the flag. +		 */ +		spin_unlock_irqrestore(&lockres->l_lock, flags); +		spin_lock_irqsave(&osb->dc_task_lock, flags2); +		list_del_init(&lockres->l_blocked_list); +		osb->blocked_lock_count--; +		spin_unlock_irqrestore(&osb->dc_task_lock, flags2); +		/* +		 * Warn if we recurse into another post_unlock call.  Strictly +		 * speaking it isn't a problem but we need to be careful if +		 * that happens (stack overflow, deadlocks, ...) so warn if +		 * ocfs2 grows a path for which this can happen. +		 */ +		WARN_ON_ONCE(lockres->l_ops->post_unlock); +		/* Since the lock is freeing we don't do much in the fn below */ +		ocfs2_process_blocked_lock(osb, lockres); +		return; +	}  	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {  		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);  		spin_unlock_irqrestore(&lockres->l_lock, flags); @@ -3178,7 +3213,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,  {  	int ret; -	ocfs2_mark_lockres_freeing(lockres); +	ocfs2_mark_lockres_freeing(osb, lockres);  	ret = ocfs2_drop_lock(osb, lockres);  	if (ret)  		mlog_errno(ret); diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 1d596d8c4a4..d293a22c32c 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -157,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);  void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, +				struct ocfs2_lock_res *lockres);  void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,  			       struct ocfs2_lock_res *lockres); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d71903c6068..2930e231f3f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -175,9 +175,13 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,  			   int datasync)  {  	int err = 0; -	journal_t *journal;  	struct inode *inode = file->f_mapping->host;  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); +	struct ocfs2_inode_info *oi = OCFS2_I(inode); +	journal_t *journal = osb->journal->j_journal; +	int ret; +	tid_t commit_tid; +	bool needs_barrier = false;  	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,  			      OCFS2_I(inode)->ip_blkno, @@ -185,33 +189,26 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,  			      file->f_path.dentry->d_name.name,  			      (unsigned long long)datasync); +	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) +		return -EROFS; +  	err = filemap_write_and_wait_range(inode->i_mapping, start, end);  	if (err)  		return err; -	/* -	 * Probably don't need the i_mutex at all in here, just putting it here -	 * to be consistent with how fsync used to be called, someone more -	 * familiar with the fs could possibly remove it. -	 */ -	mutex_lock(&inode->i_mutex); -	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { -		/* -		 * We still have to flush drive's caches to get data to the -		 * platter -		 */ -		if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) -			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); -		goto bail; +	commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid; +	if (journal->j_flags & JBD2_BARRIER && +	    !jbd2_trans_will_send_data_barrier(journal, commit_tid)) +		needs_barrier = true; +	err = jbd2_complete_transaction(journal, commit_tid); +	if (needs_barrier) { +		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); +		if (!err) +			err = ret;  	} -	journal = osb->journal->j_journal; -	err = jbd2_journal_force_commit(journal); - -bail:  	if (err)  		mlog_errno(err); -	mutex_unlock(&inode->i_mutex);  	return (err < 0) ? -EIO : 0;  } @@ -289,6 +286,7 @@ int ocfs2_update_inode_atime(struct inode *inode,  	inode->i_atime = CURRENT_TIME;  	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);  	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); +	ocfs2_update_inode_fsync_trans(handle, inode, 0);  	ocfs2_journal_dirty(handle, bh);  out_commit: @@ -338,6 +336,7 @@ int ocfs2_simple_size_update(struct inode *inode,  	if (ret < 0)  		mlog_errno(ret); +	ocfs2_update_inode_fsync_trans(handle, inode, 0);  	ocfs2_commit_trans(osb, handle);  out:  	return ret; @@ -432,6 +431,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,  	di->i_size = cpu_to_le64(new_i_size);  	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);  	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); +	ocfs2_update_inode_fsync_trans(handle, inode, 0);  	ocfs2_journal_dirty(handle, fe_bh); @@ -474,11 +474,6 @@ static int ocfs2_truncate_file(struct inode *inode,  		goto bail;  	} -	/* lets handle the simple truncate cases before doing any more -	 * cluster locking. */ -	if (new_i_size == le64_to_cpu(fe->i_size)) -		goto bail; -  	down_write(&OCFS2_I(inode)->ip_alloc_sem);  	ocfs2_resv_discard(&osb->osb_la_resmap, @@ -580,7 +575,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,  	int did_quota = 0;  	/* -	 * This function only exists for file systems which don't +	 * Unwritten extent only exists for file systems which  	 * support holes.  	 */  	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); @@ -603,8 +598,7 @@ restart_all:  		goto leave;  	} -	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, -					    clusters_to_add); +	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);  	handle = ocfs2_start_trans(osb, credits);  	if (IS_ERR(handle)) {  		status = PTR_ERR(handle); @@ -653,7 +647,7 @@ restarted_transaction:  			mlog_errno(status);  		goto leave;  	} - +	ocfs2_update_inode_fsync_trans(handle, inode, 1);  	ocfs2_journal_dirty(handle, bh);  	spin_lock(&OCFS2_I(inode)->ip_lock); @@ -719,7 +713,8 @@ leave:   * While a write will already be ordering the data, a truncate will not.   * Thus, we need to explicitly order the zeroed pages.   */ -static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, +						struct buffer_head *di_bh)  {  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);  	handle_t *handle = NULL; @@ -736,8 +731,16 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)  	}  	ret = ocfs2_jbd2_file_inode(handle, inode); -	if (ret < 0) +	if (ret < 0) {  		mlog_errno(ret); +		goto out; +	} + +	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, +				      OCFS2_JOURNAL_ACCESS_WRITE); +	if (ret) +		mlog_errno(ret); +	ocfs2_update_inode_fsync_trans(handle, inode, 1);  out:  	if (ret) { @@ -752,7 +755,7 @@ out:   * to be too fragile to do exactly what we need without us having to   * worry about recursive locking in ->write_begin() and ->write_end(). */  static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, -				 u64 abs_to) +				 u64 abs_to, struct buffer_head *di_bh)  {  	struct address_space *mapping = inode->i_mapping;  	struct page *page; @@ -760,6 +763,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,  	handle_t *handle = NULL;  	int ret = 0;  	unsigned zero_from, zero_to, block_start, block_end; +	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;  	BUG_ON(abs_from >= abs_to);  	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); @@ -802,7 +806,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,  		}  		if (!handle) { -			handle = ocfs2_zero_start_ordered_transaction(inode); +			handle = ocfs2_zero_start_ordered_transaction(inode, +								      di_bh);  			if (IS_ERR(handle)) {  				ret = PTR_ERR(handle);  				handle = NULL; @@ -819,8 +824,23 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,  			ret = 0;  	} -	if (handle) +	if (handle) { +		/* +		 * fs-writeback will release the dirty pages without page lock +		 * whose offset are over inode size, the release happens at +		 * block_write_full_page(). +		 */ +		i_size_write(inode, abs_to); +		inode->i_blocks = ocfs2_inode_sector_count(inode); +		di->i_size = cpu_to_le64((u64)i_size_read(inode)); +		inode->i_mtime = inode->i_ctime = CURRENT_TIME; +		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); +		di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); +		di->i_mtime_nsec = di->i_ctime_nsec; +		ocfs2_journal_dirty(handle, di_bh); +		ocfs2_update_inode_fsync_trans(handle, inode, 1);  		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +	}  out_unlock:  	unlock_page(page); @@ -916,7 +936,7 @@ out:   * has made sure that the entire range needs zeroing.   */  static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, -				   u64 range_end) +				   u64 range_end, struct buffer_head *di_bh)  {  	int rc = 0;  	u64 next_pos; @@ -932,7 +952,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,  		next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;  		if (next_pos > range_end)  			next_pos = range_end; -		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); +		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);  		if (rc < 0) {  			mlog_errno(rc);  			break; @@ -978,7 +998,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,  			range_end = zero_to_size;  		ret = ocfs2_zero_extend_range(inode, range_start, -					      range_end); +					      range_end, di_bh);  		if (ret) {  			mlog_errno(ret);  			break; @@ -1146,14 +1166,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)  		goto bail_unlock_rw;  	} -	if (size_change && attr->ia_size != i_size_read(inode)) { +	if (size_change) {  		status = inode_newsize_ok(inode, attr->ia_size);  		if (status)  			goto bail_unlock;  		inode_dio_wait(inode); -		if (i_size_read(inode) > attr->ia_size) { +		if (i_size_read(inode) >= attr->ia_size) {  			if (ocfs2_should_order_data(inode)) {  				status = ocfs2_begin_ordered_truncate(inode,  								      attr->ia_size); @@ -1237,7 +1257,7 @@ bail:  		dqput(transfer_to[qtype]);  	if (!status && attr->ia_valid & ATTR_MODE) { -		status = ocfs2_acl_chmod(inode); +		status = posix_acl_chmod(inode, inode->i_mode);  		if (status < 0)  			mlog_errno(status);  	} @@ -1323,6 +1343,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,  	di = (struct ocfs2_dinode *) bh->b_data;  	di->i_mode = cpu_to_le16(inode->i_mode); +	ocfs2_update_inode_fsync_trans(handle, inode, 0);  	ocfs2_journal_dirty(handle, bh); @@ -1555,6 +1576,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,  		if (ret)  			mlog_errno(ret);  	} +	ocfs2_update_inode_fsync_trans(handle, inode, 1);  	ocfs2_commit_trans(osb, handle);  out: @@ -1870,7 +1892,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,  	}  	size = sr->l_start + sr->l_len; -	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { +	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 || +	    cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {  		if (sr->l_len <= 0) {  			ret = -EINVAL;  			goto out_inode_unlock; @@ -2039,13 +2062,6 @@ out:  	return ret;  } -static void ocfs2_aiodio_wait(struct inode *inode) -{ -	wait_queue_head_t *wq = ocfs2_ioend_wq(inode); - -	wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0)); -} -  static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)  {  	int blockmask = inode->i_sb->s_blocksize - 1; @@ -2217,16 +2233,13 @@ out:  	return ret;  } -static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, -				    const struct iovec *iov, -				    unsigned long nr_segs, -				    loff_t pos) +static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, +				    struct iov_iter *from)  {  	int ret, direct_io, appending, rw_level, have_alloc_sem  = 0;  	int can_do_direct, has_refcount = 0;  	ssize_t written = 0; -	size_t ocount;		/* original count */ -	size_t count;		/* after file limit checks */ +	size_t count = iov_iter_count(from);  	loff_t old_size, *ppos = &iocb->ki_pos;  	u32 old_clusters;  	struct file *file = iocb->ki_filp; @@ -2240,7 +2253,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,  		(unsigned long long)OCFS2_I(inode)->ip_blkno,  		file->f_path.dentry->d_name.len,  		file->f_path.dentry->d_name.name, -		(unsigned int)nr_segs); +		(unsigned int)from->nr_segs);	/* GRRRRR */  	if (iocb->ki_nbytes == 0)  		return 0; @@ -2323,10 +2336,8 @@ relock:  		 * Wait on previous unaligned aio to complete before  		 * proceeding.  		 */ -		ocfs2_aiodio_wait(inode); - -		/* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ -		atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); +		mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); +		/* Mark the iocb as needing an unlock in ocfs2_dio_end_io */  		ocfs2_iocb_set_unaligned_aio(iocb);  	} @@ -2340,28 +2351,23 @@ relock:  	/* communicate with ocfs2_dio_end_io */  	ocfs2_iocb_set_rw_locked(iocb, rw_level); -	ret = generic_segment_checks(iov, &nr_segs, &ocount, -				     VERIFY_READ); -	if (ret) -		goto out_dio; - -	count = ocount;  	ret = generic_write_checks(file, ppos, &count,  				   S_ISBLK(inode->i_mode));  	if (ret)  		goto out_dio; +	iov_iter_truncate(from, count);  	if (direct_io) { -		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, -						    ppos, count, ocount); +		written = generic_file_direct_write(iocb, from, *ppos);  		if (written < 0) {  			ret = written;  			goto out_dio;  		}  	} else {  		current->backing_dev_info = file->f_mapping->backing_dev_info; -		written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, -						      ppos, count, 0); +		written = generic_perform_write(file, from, *ppos); +		if (likely(written >= 0)) +			iocb->ki_pos = *ppos + written;  		current->backing_dev_info = NULL;  	} @@ -2371,8 +2377,8 @@ out_dio:  	if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||  	    ((file->f_flags & O_DIRECT) && !direct_io)) { -		ret = filemap_fdatawrite_range(file->f_mapping, pos, -					       pos + count - 1); +		ret = filemap_fdatawrite_range(file->f_mapping, *ppos, +					       *ppos + count - 1);  		if (ret < 0)  			written = ret; @@ -2385,8 +2391,8 @@ out_dio:  		}  		if (!ret) -			ret = filemap_fdatawait_range(file->f_mapping, pos, -						      pos + count - 1); +			ret = filemap_fdatawait_range(file->f_mapping, *ppos, +						      *ppos + count - 1);  	}  	/* @@ -2406,7 +2412,7 @@ out_dio:  	if (unaligned_dio) {  		ocfs2_iocb_clear_unaligned_aio(iocb); -		atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); +		mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);  	}  out: @@ -2424,84 +2430,6 @@ out_sems:  	return ret;  } -static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, -				struct file *out, -				struct splice_desc *sd) -{ -	int ret; - -	ret = ocfs2_prepare_inode_for_write(out, &sd->pos, -					    sd->total_len, 0, NULL, NULL); -	if (ret < 0) { -		mlog_errno(ret); -		return ret; -	} - -	return splice_from_pipe_feed(pipe, sd, pipe_to_file); -} - -static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, -				       struct file *out, -				       loff_t *ppos, -				       size_t len, -				       unsigned int flags) -{ -	int ret; -	struct address_space *mapping = out->f_mapping; -	struct inode *inode = mapping->host; -	struct splice_desc sd = { -		.total_len = len, -		.flags = flags, -		.pos = *ppos, -		.u.file = out, -	}; - - -	trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry, -			(unsigned long long)OCFS2_I(inode)->ip_blkno, -			out->f_path.dentry->d_name.len, -			out->f_path.dentry->d_name.name, len); - -	pipe_lock(pipe); - -	splice_from_pipe_begin(&sd); -	do { -		ret = splice_from_pipe_next(pipe, &sd); -		if (ret <= 0) -			break; - -		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); -		ret = ocfs2_rw_lock(inode, 1); -		if (ret < 0) -			mlog_errno(ret); -		else { -			ret = ocfs2_splice_to_file(pipe, out, &sd); -			ocfs2_rw_unlock(inode, 1); -		} -		mutex_unlock(&inode->i_mutex); -	} while (ret > 0); -	splice_from_pipe_end(pipe, &sd); - -	pipe_unlock(pipe); - -	if (sd.num_spliced) -		ret = sd.num_spliced; - -	if (ret > 0) { -		int err; - -		err = generic_write_sync(out, *ppos, ret); -		if (err) -			ret = err; -		else -			*ppos += ret; - -		balance_dirty_pages_ratelimited(mapping); -	} - -	return ret; -} -  static ssize_t ocfs2_file_splice_read(struct file *in,  				      loff_t *ppos,  				      struct pipe_inode_info *pipe, @@ -2517,7 +2445,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,  			in->f_path.dentry->d_name.name, len);  	/* -	 * See the comment in ocfs2_file_aio_read() +	 * See the comment in ocfs2_file_read_iter()  	 */  	ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);  	if (ret < 0) { @@ -2532,10 +2460,8 @@ bail:  	return ret;  } -static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, -				   const struct iovec *iov, -				   unsigned long nr_segs, -				   loff_t pos) +static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, +				   struct iov_iter *to)  {  	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;  	struct file *filp = iocb->ki_filp; @@ -2544,7 +2470,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,  	trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,  			(unsigned long long)OCFS2_I(inode)->ip_blkno,  			filp->f_path.dentry->d_name.len, -			filp->f_path.dentry->d_name.name, nr_segs); +			filp->f_path.dentry->d_name.name, +			to->nr_segs);	/* GRRRRR */  	if (!inode) { @@ -2589,13 +2516,13 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,  	}  	ocfs2_inode_unlock(inode, lock_level); -	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); +	ret = generic_file_read_iter(iocb, to);  	trace_generic_file_aio_read_ret(ret);  	/* buffered aio wouldn't have proper lock coverage today */  	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); -	/* see ocfs2_file_aio_write */ +	/* see ocfs2_file_write_iter */  	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {  		rw_level = -1;  		have_alloc_sem = 0; @@ -2623,7 +2550,16 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)  	case SEEK_SET:  		break;  	case SEEK_END: -		offset += inode->i_size; +		/* SEEK_END requires the OCFS2 inode lock for the file +		 * because it references the file's size. +		 */ +		ret = ocfs2_inode_lock(inode, NULL, 0); +		if (ret < 0) { +			mlog_errno(ret); +			goto out; +		} +		offset += i_size_read(inode); +		ocfs2_inode_unlock(inode, 0);  		break;  	case SEEK_CUR:  		if (offset == 0) { @@ -2662,6 +2598,7 @@ const struct inode_operations ocfs2_file_iops = {  	.removexattr	= generic_removexattr,  	.fiemap		= ocfs2_fiemap,  	.get_acl	= ocfs2_iop_get_acl, +	.set_acl	= ocfs2_iop_set_acl,  };  const struct inode_operations ocfs2_special_file_iops = { @@ -2669,6 +2606,7 @@ const struct inode_operations ocfs2_special_file_iops = {  	.getattr	= ocfs2_getattr,  	.permission	= ocfs2_permission,  	.get_acl	= ocfs2_iop_get_acl, +	.set_acl	= ocfs2_iop_set_acl,  };  /* @@ -2677,14 +2615,14 @@ const struct inode_operations ocfs2_special_file_iops = {   */  const struct file_operations ocfs2_fops = {  	.llseek		= ocfs2_file_llseek, -	.read		= do_sync_read, -	.write		= do_sync_write, +	.read		= new_sync_read, +	.write		= new_sync_write,  	.mmap		= ocfs2_mmap,  	.fsync		= ocfs2_sync_file,  	.release	= ocfs2_file_release,  	.open		= ocfs2_file_open, -	.aio_read	= ocfs2_file_aio_read, -	.aio_write	= ocfs2_file_aio_write, +	.read_iter	= ocfs2_file_read_iter, +	.write_iter	= ocfs2_file_write_iter,  	.unlocked_ioctl	= ocfs2_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl   = ocfs2_compat_ioctl, @@ -2692,7 +2630,7 @@ const struct file_operations ocfs2_fops = {  	.lock		= ocfs2_lock,  	.flock		= ocfs2_flock,  	.splice_read	= ocfs2_file_splice_read, -	.splice_write	= ocfs2_file_splice_write, +	.splice_write	= iter_file_splice_write,  	.fallocate	= ocfs2_fallocate,  }; @@ -2725,21 +2663,21 @@ const struct file_operations ocfs2_dops = {   */  const struct file_operations ocfs2_fops_no_plocks = {  	.llseek		= ocfs2_file_llseek, -	.read		= do_sync_read, -	.write		= do_sync_write, +	.read		= new_sync_read, +	.write		= new_sync_write,  	.mmap		= ocfs2_mmap,  	.fsync		= ocfs2_sync_file,  	.release	= ocfs2_file_release,  	.open		= ocfs2_file_open, -	.aio_read	= ocfs2_file_aio_read, -	.aio_write	= ocfs2_file_aio_write, +	.read_iter	= ocfs2_file_read_iter, +	.write_iter	= ocfs2_file_write_iter,  	.unlocked_ioctl	= ocfs2_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl   = ocfs2_compat_ioctl,  #endif  	.flock		= ocfs2_flock,  	.splice_read	= ocfs2_file_splice_read, -	.splice_write	= ocfs2_file_splice_write, +	.splice_write	= iter_file_splice_write,  	.fallocate	= ocfs2_fallocate,  }; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index f87f9bd1edf..437de7f768c 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -130,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,  	struct inode *inode = NULL;  	struct super_block *sb = osb->sb;  	struct ocfs2_find_inode_args args; +	journal_t *journal = OCFS2_SB(sb)->journal->j_journal;  	trace_ocfs2_iget_begin((unsigned long long)blkno, flags,  			       sysfile_type); @@ -169,6 +170,32 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,  		goto bail;  	} +	/* +	 * Set transaction id's of transactions that have to be committed +	 * to finish f[data]sync. We set them to currently running transaction +	 * as we cannot be sure that the inode or some of its metadata isn't +	 * part of the transaction - the inode could have been reclaimed and +	 * now it is reread from disk. +	 */ +	if (journal) { +		transaction_t *transaction; +		tid_t tid; +		struct ocfs2_inode_info *oi = OCFS2_I(inode); + +		read_lock(&journal->j_state_lock); +		if (journal->j_running_transaction) +			transaction = journal->j_running_transaction; +		else +			transaction = journal->j_committing_transaction; +		if (transaction) +			tid = transaction->t_tid; +		else +			tid = journal->j_commit_sequence; +		read_unlock(&journal->j_state_lock); +		oi->i_sync_tid = tid; +		oi->i_datasync_tid = tid; +	} +  bail:  	if (!IS_ERR(inode)) {  		trace_ocfs2_iget_end(inode,  @@ -386,19 +413,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,  	u32 generation = 0;  	status = -EINVAL; -	if (inode == NULL || inode->i_sb == NULL) { -		mlog(ML_ERROR, "bad inode\n"); -		return status; -	}  	sb = inode->i_sb;  	osb = OCFS2_SB(sb); -	if (!args) { -		mlog(ML_ERROR, "bad inode args\n"); -		make_bad_inode(inode); -		return status; -	} -  	/*  	 * To improve performance of cold-cache inode stats, we take  	 * the cluster lock here if possible. @@ -814,11 +831,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)  		goto bail;  	} -	/* If we're coming from downconvert_thread we can't go into our own -	 * voting [hello, deadlock city!], so unforuntately we just -	 * have to skip deleting this guy. That's OK though because -	 * the node who's doing the actual deleting should handle it -	 * anyway. */ +	/* +	 * If we're coming from downconvert_thread we can't go into our own +	 * voting [hello, deadlock city!] so we cannot delete the inode. But +	 * since we dropped last inode ref when downconverting dentry lock, +	 * we cannot have the file open and thus the node doing unlink will +	 * take care of deleting the inode. +	 */  	if (current == osb->dc_task)  		goto bail; @@ -832,12 +851,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)  		goto bail_unlock;  	} -	/* If we have allowd wipe of this inode for another node, it -	 * will be marked here so we can safely skip it. Recovery will -	 * cleanup any inodes we might inadvertently skip here. */ -	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) -		goto bail_unlock; -  	ret = 1;  bail_unlock:  	spin_unlock(&oi->ip_lock); @@ -951,7 +964,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,  		(unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);  	if (sync_data)  		filemap_write_and_wait(inode->i_mapping); -	truncate_inode_pages(&inode->i_data, 0); +	truncate_inode_pages_final(&inode->i_data);  }  static void ocfs2_delete_inode(struct inode *inode) @@ -970,8 +983,6 @@ static void ocfs2_delete_inode(struct inode *inode)  	if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)  		goto bail; -	dquot_initialize(inode); -  	if (!ocfs2_inode_is_valid_to_delete(inode)) {  		/* It's probably not necessary to truncate_inode_pages  		 * here but we do it for safety anyway (it will most @@ -980,6 +991,8 @@ static void ocfs2_delete_inode(struct inode *inode)  		goto bail;  	} +	dquot_initialize(inode); +  	/* We want to block signals in delete_inode as the lock and  	 * messaging paths may return us -ERESTARTSYS. Which would  	 * cause us to exit early, resulting in inodes being orphaned @@ -1067,6 +1080,7 @@ static void ocfs2_clear_inode(struct inode *inode)  {  	int status;  	struct ocfs2_inode_info *oi = OCFS2_I(inode); +	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);  	clear_inode(inode);  	trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, @@ -1083,9 +1097,9 @@ static void ocfs2_clear_inode(struct inode *inode)  	/* Do these before all the other work so that we don't bounce  	 * the downconvert thread while waiting to destroy the locks. */ -	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); -	ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); -	ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); +	ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres); +	ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); +	ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);  	ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,  			   &oi->ip_la_data_resv); @@ -1167,7 +1181,7 @@ void ocfs2_evict_inode(struct inode *inode)  	    (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {  		ocfs2_delete_inode(inode);  	} else { -		truncate_inode_pages(&inode->i_data, 0); +		truncate_inode_pages_final(&inode->i_data);  	}  	ocfs2_clear_inode(inode);  } @@ -1270,6 +1284,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,  	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);  	ocfs2_journal_dirty(handle, bh); +	ocfs2_update_inode_fsync_trans(handle, inode, 1);  leave:  	return status;  } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 621fc73bf23..a6c991c0fc9 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -44,7 +44,7 @@ struct ocfs2_inode_info  	struct rw_semaphore		ip_xattr_sem;  	/* Number of outstanding AIO's which are not page aligned */ -	atomic_t			ip_unaligned_aio; +	struct mutex			ip_unaligned_aio;  	/* These fields are protected by ip_lock */  	spinlock_t			ip_lock; @@ -73,6 +73,13 @@ struct ocfs2_inode_info  	u32				ip_dir_lock_gen;  	struct ocfs2_alloc_reservation	ip_la_data_resv; + +	/* +	 * Transactions that contain inode's metadata needed to complete +	 * fsync and fdatasync, respectively. +	 */ +	tid_t i_sync_tid; +	tid_t i_datasync_tid;  };  /* @@ -84,8 +91,6 @@ struct ocfs2_inode_info  #define OCFS2_INODE_BITMAP		0x00000004  /* This inode has been wiped from disk */  #define OCFS2_INODE_DELETED		0x00000008 -/* Another node is deleting, so our delete is a nop */ -#define OCFS2_INODE_SKIP_DELETE		0x00000010  /* Has the inode been orphaned on another node?   *   * This hints to ocfs2_drop_inode that it should clear i_nlink before @@ -100,11 +105,11 @@ struct ocfs2_inode_info   * rely on ocfs2_delete_inode to sort things out under the proper   * cluster locks.   */ -#define OCFS2_INODE_MAYBE_ORPHANED	0x00000020 +#define OCFS2_INODE_MAYBE_ORPHANED	0x00000010  /* Does someone have the file open O_DIRECT */ -#define OCFS2_INODE_OPEN_DIRECT		0x00000040 +#define OCFS2_INODE_OPEN_DIRECT		0x00000020  /* Tell the inode wipe code it's not in orphan dir */ -#define OCFS2_INODE_SKIP_ORPHAN_DIR     0x00000080 +#define OCFS2_INODE_SKIP_ORPHAN_DIR     0x00000040  static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)  { diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index fa32ce9b455..6f66b3751ac 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -7,6 +7,7 @@  #include <linux/fs.h>  #include <linux/mount.h> +#include <linux/blkdev.h>  #include <linux/compat.h>  #include <cluster/masklog.h> @@ -142,8 +143,8 @@ bail:  	return status;  } -int ocfs2_info_handle_blocksize(struct inode *inode, -				struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_blocksize(struct inode *inode, +				       struct ocfs2_info_request __user *req)  {  	int status = -EFAULT;  	struct ocfs2_info_blocksize oib; @@ -166,8 +167,8 @@ bail:  	return status;  } -int ocfs2_info_handle_clustersize(struct inode *inode, -				  struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_clustersize(struct inode *inode, +					 struct ocfs2_info_request __user *req)  {  	int status = -EFAULT;  	struct ocfs2_info_clustersize oic; @@ -191,8 +192,8 @@ bail:  	return status;  } -int ocfs2_info_handle_maxslots(struct inode *inode, -			       struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_maxslots(struct inode *inode, +				      struct ocfs2_info_request __user *req)  {  	int status = -EFAULT;  	struct ocfs2_info_maxslots oim; @@ -216,8 +217,8 @@ bail:  	return status;  } -int ocfs2_info_handle_label(struct inode *inode, -			    struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_label(struct inode *inode, +				   struct ocfs2_info_request __user *req)  {  	int status = -EFAULT;  	struct ocfs2_info_label oil; @@ -241,8 +242,8 @@ bail:  	return status;  } -int ocfs2_info_handle_uuid(struct inode *inode, -			   struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_uuid(struct inode *inode, +				  struct ocfs2_info_request __user *req)  {  	int status = -EFAULT;  	struct ocfs2_info_uuid oiu; @@ -266,8 +267,8 @@ bail:  	return status;  } -int ocfs2_info_handle_fs_features(struct inode *inode, -				  struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_fs_features(struct inode *inode, +					 struct ocfs2_info_request __user *req)  {  	int status = -EFAULT;  	struct ocfs2_info_fs_features oif; @@ -293,8 +294,8 @@ bail:  	return status;  } -int ocfs2_info_handle_journal_size(struct inode *inode, -				   struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_journal_size(struct inode *inode, +					  struct ocfs2_info_request __user *req)  {  	int status = -EFAULT;  	struct ocfs2_info_journal_size oij; @@ -318,9 +319,10 @@ bail:  	return status;  } -int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, -				struct inode *inode_alloc, u64 blkno, -				struct ocfs2_info_freeinode *fi, u32 slot) +static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, +				       struct inode *inode_alloc, u64 blkno, +				       struct ocfs2_info_freeinode *fi, +				       u32 slot)  {  	int status = 0, unlock = 0; @@ -365,8 +367,8 @@ bail:  	return status;  } -int ocfs2_info_handle_freeinode(struct inode *inode, -				struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_freeinode(struct inode *inode, +				       struct ocfs2_info_request __user *req)  {  	u32 i;  	u64 blkno = -1; @@ -412,11 +414,12 @@ int ocfs2_info_handle_freeinode(struct inode *inode,  		}  		status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); -		if (status < 0) -			goto bail;  		iput(inode_alloc);  		inode_alloc = NULL; + +		if (status < 0) +			goto bail;  	}  	o2info_set_request_filled(&oifi->ifi_req); @@ -460,19 +463,19 @@ static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,  	stats->ffs_free_chunks_real++;  } -void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, -			   unsigned int chunksize) +static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, +				  unsigned int chunksize)  {  	o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);  	o2ffg_update_stats(&(ffg->iff_ffs), chunksize);  } -int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, -				   struct inode *gb_inode, -				   struct ocfs2_dinode *gb_dinode, -				   struct ocfs2_chain_rec *rec, -				   struct ocfs2_info_freefrag *ffg, -				   u32 chunks_in_group) +static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, +					  struct inode *gb_inode, +					  struct ocfs2_dinode *gb_dinode, +					  struct ocfs2_chain_rec *rec, +					  struct ocfs2_info_freefrag *ffg, +					  u32 chunks_in_group)  {  	int status = 0, used;  	u64 blkno; @@ -570,9 +573,9 @@ bail:  	return status;  } -int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, -				    struct inode *gb_inode, u64 blkno, -				    struct ocfs2_info_freefrag *ffg) +static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, +					   struct inode *gb_inode, u64 blkno, +					   struct ocfs2_info_freefrag *ffg)  {  	u32 chunks_in_group;  	int status = 0, unlock = 0, i; @@ -650,8 +653,8 @@ bail:  	return status;  } -int ocfs2_info_handle_freefrag(struct inode *inode, -			       struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_freefrag(struct inode *inode, +				      struct ocfs2_info_request __user *req)  {  	u64 blkno = -1;  	char namebuf[40]; @@ -721,8 +724,8 @@ out_err:  	return status;  } -int ocfs2_info_handle_unknown(struct inode *inode, -			      struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_unknown(struct inode *inode, +				     struct ocfs2_info_request __user *req)  {  	int status = -EFAULT;  	struct ocfs2_info_request oir; @@ -750,8 +753,8 @@ bail:   * - distinguish different requests.   * - validate size of different requests.   */ -int ocfs2_info_handle_request(struct inode *inode, -			      struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_request(struct inode *inode, +				     struct ocfs2_info_request __user *req)  {  	int status = -EFAULT;  	struct ocfs2_info_request oir; @@ -809,8 +812,8 @@ bail:  	return status;  } -int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, -			  u64 *req_addr, int compat_flag) +static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, +				 u64 *req_addr, int compat_flag)  {  	int status = -EFAULT;  	u64 __user *bp = NULL; @@ -847,8 +850,8 @@ bail:   * a better backward&forward compatibility, since a small piece of   * request will be less likely to be broken if disk layout get changed.   */ -int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, -		      int compat_flag) +static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, +			     int compat_flag)  {  	int i, status = 0;  	u64 req_addr; @@ -966,15 +969,21 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  	case FITRIM:  	{  		struct super_block *sb = inode->i_sb; +		struct request_queue *q = bdev_get_queue(sb->s_bdev);  		struct fstrim_range range;  		int ret = 0;  		if (!capable(CAP_SYS_ADMIN))  			return -EPERM; +		if (!blk_queue_discard(q)) +			return -EOPNOTSUPP; +  		if (copy_from_user(&range, argp, sizeof(range)))  			return -EFAULT; +		range.minlen = max_t(u64, q->limits.discard_granularity, +				     range.minlen);  		ret = ocfs2_trim_fs(sb, &range);  		if (ret < 0)  			return ret; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 44fc3e530c3..4b0c68849b3 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -30,6 +30,7 @@  #include <linux/kthread.h>  #include <linux/time.h>  #include <linux/random.h> +#include <linux/delay.h>  #include <cluster/masklog.h> @@ -2132,12 +2133,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,  		iter = oi->ip_next_orphan;  		spin_lock(&oi->ip_lock); -		/* The remote delete code may have set these on the -		 * assumption that the other node would wipe them -		 * successfully.  If they are still in the node's -		 * orphan dir, we need to reset that state. */ -		oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); -  		/* Set the proper information to get us going into  		 * ocfs2_delete_inode. */  		oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; @@ -2191,8 +2186,20 @@ static int ocfs2_commit_thread(void *arg)  					 || kthread_should_stop());  		status = ocfs2_commit_cache(osb); -		if (status < 0) -			mlog_errno(status); +		if (status < 0) { +			static unsigned long abort_warn_time; + +			/* Warn about this once per minute */ +			if (printk_timed_ratelimit(&abort_warn_time, 60*HZ)) +				mlog(ML_ERROR, "status = %d, journal is " +						"already aborted.\n", status); +			/* +			 * After ocfs2_commit_cache() fails, j_num_trans has a +			 * non-zero value.  Sleep here to avoid a busy-wait +			 * loop. +			 */ +			msleep_interruptible(1000); +		}  		if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){  			mlog(ML_KTHREAD, diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 0b479bab367..7f8cde94abf 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -524,8 +524,7 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)   * the result may be wrong.   */  static inline int ocfs2_calc_extend_credits(struct super_block *sb, -					    struct ocfs2_extent_list *root_el, -					    u32 bits_wanted) +					    struct ocfs2_extent_list *root_el)  {  	int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks; @@ -627,4 +626,15 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode,  				new_size);  } +static inline void ocfs2_update_inode_fsync_trans(handle_t *handle, +						  struct inode *inode, +						  int datasync) +{ +	struct ocfs2_inode_info *oi = OCFS2_I(inode); + +	oi->i_sync_tid = handle->h_transaction->t_tid; +	if (datasync) +		oi->i_datasync_tid = handle->h_transaction->t_tid; +} +  #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index cd5496b7a0a..04401345562 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -781,6 +781,48 @@ bail:  	return status;  } +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, +				handle_t *handle, +				struct ocfs2_alloc_context *ac, +				u32 bit_off, +				u32 num_bits) +{ +	int status, start; +	u32 clear_bits; +	struct inode *local_alloc_inode; +	void *bitmap; +	struct ocfs2_dinode *alloc; +	struct ocfs2_local_alloc *la; + +	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + +	local_alloc_inode = ac->ac_inode; +	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; +	la = OCFS2_LOCAL_ALLOC(alloc); + +	bitmap = la->la_bitmap; +	start = bit_off - le32_to_cpu(la->la_bm_off); +	clear_bits = num_bits; + +	status = ocfs2_journal_access_di(handle, +			INODE_CACHE(local_alloc_inode), +			osb->local_alloc_bh, +			OCFS2_JOURNAL_ACCESS_WRITE); +	if (status < 0) { +		mlog_errno(status); +		goto bail; +	} + +	while (clear_bits--) +		ocfs2_clear_bit(start++, bitmap); + +	le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits); +	ocfs2_journal_dirty(handle, osb->local_alloc_bh); + +bail: +	return status; +} +  static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)  {  	u32 count; diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index 1be9b586446..44a7d1fb2de 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -55,6 +55,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,  				 u32 *bit_off,  				 u32 *num_bits); +int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb, +				handle_t *handle, +				struct ocfs2_alloc_context *ac, +				u32 bit_off, +				u32 num_bits); +  void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,  				      unsigned int num_clusters);  void ocfs2_la_enable_worker(struct work_struct *work); diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index e57c804069e..6b6d092b099 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -82,6 +82,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,  	}  	ret = flock_lock_file_wait(file, fl); +	if (ret) +		ocfs2_file_unlock(file);  out:  	mutex_unlock(&fp->fp_mutex); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 3d3f3c83065..599eb4c4c8b 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -151,6 +151,7 @@ static int __ocfs2_move_extent(handle_t *handle,  							old_blkno, len);  	} +	ocfs2_update_inode_fsync_trans(handle, inode, 0);  out:  	ocfs2_free_path(path);  	return ret; @@ -201,8 +202,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,  		}  	} -	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, -					      clusters_to_move + 2); +	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);  	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",  	     extra_blocks, clusters_to_move, *credits); @@ -562,83 +562,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,  	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);  } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, -				       handle_t *handle, -				       struct buffer_head *di_bh, -				       u32 num_bits, -				       u16 chain) -{ -	int ret; -	u32 tmp_used; -	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; -	struct ocfs2_chain_list *cl = -				(struct ocfs2_chain_list *) &di->id2.i_chain; - -	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, -				      OCFS2_JOURNAL_ACCESS_WRITE); -	if (ret < 0) { -		mlog_errno(ret); -		goto out; -	} - -	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); -	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); -	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); -	ocfs2_journal_dirty(handle, di_bh); - -out: -	return ret; -} - -static inline int ocfs2_block_group_set_bits(handle_t *handle, -					     struct inode *alloc_inode, -					     struct ocfs2_group_desc *bg, -					     struct buffer_head *group_bh, -					     unsigned int bit_off, -					     unsigned int num_bits) -{ -	int status; -	void *bitmap = bg->bg_bitmap; -	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; - -	/* All callers get the descriptor via -	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */ -	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); -	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); - -	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, -	     num_bits); - -	if (ocfs2_is_cluster_bitmap(alloc_inode)) -		journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - -	status = ocfs2_journal_access_gd(handle, -					 INODE_CACHE(alloc_inode), -					 group_bh, -					 journal_type); -	if (status < 0) { -		mlog_errno(status); -		goto bail; -	} - -	le16_add_cpu(&bg->bg_free_bits_count, -num_bits); -	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { -		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" -			    " count %u but claims %u are freed. num_bits %d", -			    (unsigned long long)le64_to_cpu(bg->bg_blkno), -			    le16_to_cpu(bg->bg_bits), -			    le16_to_cpu(bg->bg_free_bits_count), num_bits); -		return -EROFS; -	} -	while (num_bits--) -		ocfs2_set_bit(bit_off++, bitmap); - -	ocfs2_journal_dirty(handle, group_bh); - -bail: -	return status; -} -  static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,  			     u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,  			     u32 len, int ext_flags) @@ -768,8 +691,11 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,  	ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,  					 goal_bit, len); -	if (ret) +	if (ret) { +		ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, +					       le16_to_cpu(gd->bg_chain));  		mlog_errno(ret); +	}  	/*  	 * Here we should write the new page out first if we are @@ -1035,6 +961,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)  	inode->i_ctime = CURRENT_TIME;  	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);  	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); +	ocfs2_update_inode_fsync_trans(handle, inode, 0);  	ocfs2_journal_dirty(handle, di_bh); @@ -1067,8 +994,10 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)  	if (status)  		return status; -	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) +	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) { +		status = -EPERM;  		goto out_drop; +	}  	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {  		status = -EPERM; @@ -1090,8 +1019,10 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)  		goto out_free;  	} -	if (range.me_start > i_size_read(inode)) +	if (range.me_start > i_size_read(inode)) { +		status = -EINVAL;  		goto out_free; +	}  	if (range.me_start + range.me_len > i_size_read(inode))  			range.me_len = i_size_read(inode) - range.me_start; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index be3f8676a43..8add6f1030d 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -205,6 +205,21 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)  	return inode;  } +static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, +		struct dentry *dentry, struct inode *inode) +{ +	struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + +	ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); +	ocfs2_lock_res_free(&dl->dl_lockres); +	BUG_ON(dl->dl_count != 1); +	spin_lock(&dentry_attach_lock); +	dentry->d_fsdata = NULL; +	spin_unlock(&dentry_attach_lock); +	kfree(dl); +	iput(inode); +} +  static int ocfs2_mknod(struct inode *dir,  		       struct dentry *dentry,  		       umode_t mode, @@ -230,6 +245,8 @@ static int ocfs2_mknod(struct inode *dir,  	struct ocfs2_dir_lookup_result lookup = { NULL, };  	sigset_t oldset;  	int did_block_signals = 0; +	struct posix_acl *default_acl = NULL, *acl = NULL; +	struct ocfs2_dentry_lock *dl = NULL;  	trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,  			  (unsigned long long)OCFS2_I(dir)->ip_blkno, @@ -331,6 +348,12 @@ static int ocfs2_mknod(struct inode *dir,  		goto leave;  	} +	status = posix_acl_create(dir, &mode, &default_acl, &acl); +	if (status) { +		mlog_errno(status); +		goto leave; +	} +  	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,  							    S_ISDIR(mode),  							    xattr_credits)); @@ -379,8 +402,17 @@ static int ocfs2_mknod(struct inode *dir,  		inc_nlink(dir);  	} -	status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, -				meta_ac, data_ac); +	if (default_acl) { +		status = ocfs2_set_acl(handle, inode, new_fe_bh, +				       ACL_TYPE_DEFAULT, default_acl, +				       meta_ac, data_ac); +	} +	if (!status && acl) { +		status = ocfs2_set_acl(handle, inode, new_fe_bh, +				       ACL_TYPE_ACCESS, acl, +				       meta_ac, data_ac); +	} +  	if (status < 0) {  		mlog_errno(status);  		goto leave; @@ -407,6 +439,8 @@ static int ocfs2_mknod(struct inode *dir,  		goto leave;  	} +	dl = dentry->d_fsdata; +  	status = ocfs2_add_entry(handle, dentry, inode,  				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,  				 &lookup); @@ -419,6 +453,10 @@ static int ocfs2_mknod(struct inode *dir,  	d_instantiate(dentry, inode);  	status = 0;  leave: +	if (default_acl) +		posix_acl_release(default_acl); +	if (acl) +		posix_acl_release(acl);  	if (status < 0 && did_quota_inode)  		dquot_free_inode(inode);  	if (handle) @@ -430,7 +468,6 @@ leave:  	brelse(new_fe_bh);  	brelse(parent_fe_bh); -	kfree(si.name);  	kfree(si.value);  	ocfs2_free_dir_lookup_result(&lookup); @@ -450,6 +487,9 @@ leave:  	 * ocfs2_delete_inode will mutex_lock again.  	 */  	if ((status < 0) && inode) { +		if (dl) +			ocfs2_cleanup_add_entry_failure(osb, dentry, inode); +  		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;  		clear_nlink(inode);  		iput(inode); @@ -475,6 +515,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,  	struct ocfs2_dinode *fe = NULL;  	struct ocfs2_extent_list *fel;  	u16 feat; +	struct ocfs2_inode_info *oi = OCFS2_I(inode);  	*new_fe_bh = NULL; @@ -489,7 +530,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,  	*new_fe_bh = sb_getblk(osb->sb, fe_blkno);  	if (!*new_fe_bh) { -		status = -EIO; +		status = -ENOMEM;  		mlog_errno(status);  		goto leave;  	} @@ -556,8 +597,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,  			mlog_errno(status);  	} -	status = 0; /* error in ocfs2_create_new_inode_locks is not -		     * critical */ +	oi->i_sync_tid = handle->h_transaction->t_tid; +	oi->i_datasync_tid = handle->h_transaction->t_tid;  leave:  	if (status < 0) { @@ -644,6 +685,7 @@ static int ocfs2_link(struct dentry *old_dentry,  	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);  	struct ocfs2_dir_lookup_result lookup = { NULL, };  	sigset_t oldset; +	u64 old_de_ino;  	trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,  			 old_dentry->d_name.len, old_dentry->d_name.name, @@ -666,6 +708,22 @@ static int ocfs2_link(struct dentry *old_dentry,  		goto out;  	} +	err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name, +			old_dentry->d_name.len, &old_de_ino); +	if (err) { +		err = -ENOENT; +		goto out; +	} + +	/* +	 * Check whether another node removed the source inode while we +	 * were in the vfs. +	 */ +	if (old_de_ino != OCFS2_I(inode)->ip_blkno) { +		err = -ENOENT; +		goto out; +	} +  	err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,  					dentry->d_name.len);  	if (err) @@ -948,12 +1006,71 @@ leave:  	ocfs2_free_dir_lookup_result(&orphan_insert);  	ocfs2_free_dir_lookup_result(&lookup); -	if (status && (status != -ENOTEMPTY)) +	if (status && (status != -ENOTEMPTY) && (status != -ENOENT))  		mlog_errno(status);  	return status;  } +static int ocfs2_check_if_ancestor(struct ocfs2_super *osb, +		u64 src_inode_no, u64 dest_inode_no) +{ +	int ret = 0, i = 0; +	u64 parent_inode_no = 0; +	u64 child_inode_no = src_inode_no; +	struct inode *child_inode; + +#define MAX_LOOKUP_TIMES 32 +	while (1) { +		child_inode = ocfs2_iget(osb, child_inode_no, 0, 0); +		if (IS_ERR(child_inode)) { +			ret = PTR_ERR(child_inode); +			break; +		} + +		ret = ocfs2_inode_lock(child_inode, NULL, 0); +		if (ret < 0) { +			iput(child_inode); +			if (ret != -ENOENT) +				mlog_errno(ret); +			break; +		} + +		ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2, +				&parent_inode_no); +		ocfs2_inode_unlock(child_inode, 0); +		iput(child_inode); +		if (ret < 0) { +			ret = -ENOENT; +			break; +		} + +		if (parent_inode_no == dest_inode_no) { +			ret = 1; +			break; +		} + +		if (parent_inode_no == osb->root_inode->i_ino) { +			ret = 0; +			break; +		} + +		child_inode_no = parent_inode_no; + +		if (++i >= MAX_LOOKUP_TIMES) { +			mlog(ML_NOTICE, "max lookup times reached, filesystem " +					"may have nested directories, " +					"src inode: %llu, dest inode: %llu.\n", +					(unsigned long long)src_inode_no, +					(unsigned long long)dest_inode_no); +			ret = 0; +			break; +		} +	} + +	return ret; +} +  /*   * The only place this should be used is rename!   * if they have the same id, then the 1st one is the only one locked. @@ -965,6 +1082,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,  			     struct inode *inode2)  {  	int status; +	int inode1_is_ancestor, inode2_is_ancestor;  	struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);  	struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);  	struct buffer_head **tmpbh; @@ -978,9 +1096,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,  	if (*bh2)  		*bh2 = NULL; -	/* we always want to lock the one with the lower lockid first. */ +	/* we always want to lock the one with the lower lockid first. +	 * and if they are nested, we lock ancestor first */  	if (oi1->ip_blkno != oi2->ip_blkno) { -		if (oi1->ip_blkno < oi2->ip_blkno) { +		inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno, +				oi1->ip_blkno); +		if (inode1_is_ancestor < 0) { +			status = inode1_is_ancestor; +			goto bail; +		} + +		inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno, +				oi2->ip_blkno); +		if (inode2_is_ancestor < 0) { +			status = inode2_is_ancestor; +			goto bail; +		} + +		if ((inode1_is_ancestor == 1) || +				(oi1->ip_blkno < oi2->ip_blkno && +				inode2_is_ancestor == 0)) {  			/* switch id1 and id2 around */  			tmpbh = bh2;  			bh2 = bh1; @@ -1061,6 +1196,7 @@ static int ocfs2_rename(struct inode *old_dir,  	struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };  	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };  	struct ocfs2_dir_lookup_result target_insert = { NULL, }; +	bool should_add_orphan = false;  	/* At some point it might be nice to break this function up a  	 * bit. */ @@ -1097,6 +1233,21 @@ static int ocfs2_rename(struct inode *old_dir,  			goto bail;  		}  		rename_lock = 1; + +		/* here we cannot guarantee the inodes haven't just been +		 * changed, so check if they are nested again */ +		status = ocfs2_check_if_ancestor(osb, new_dir->i_ino, +				old_inode->i_ino); +		if (status < 0) { +			mlog_errno(status); +			goto bail; +		} else if (status == 1) { +			status = -EPERM; +			trace_ocfs2_rename_not_permitted( +					(unsigned long long)old_inode->i_ino, +					(unsigned long long)new_dir->i_ino); +			goto bail; +		}  	}  	/* if old and new are the same, this'll just do one lock. */ @@ -1267,6 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir,  				mlog_errno(status);  				goto bail;  			} +			should_add_orphan = true;  		}  	} else {  		BUG_ON(new_dentry->d_parent->d_inode != new_dir); @@ -1311,17 +1463,6 @@ static int ocfs2_rename(struct inode *old_dir,  			goto bail;  		} -		if (S_ISDIR(new_inode->i_mode) || -		    (ocfs2_read_links_count(newfe) == 1)) { -			status = ocfs2_orphan_add(osb, handle, new_inode, -						  newfe_bh, orphan_name, -						  &orphan_insert, orphan_dir); -			if (status < 0) { -				mlog_errno(status); -				goto bail; -			} -		} -  		/* change the dirent to point to the correct inode */  		status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,  					    old_inode); @@ -1336,6 +1477,15 @@ static int ocfs2_rename(struct inode *old_dir,  		else  			ocfs2_add_links_count(newfe, -1);  		ocfs2_journal_dirty(handle, newfe_bh); +		if (should_add_orphan) { +			status = ocfs2_orphan_add(osb, handle, new_inode, +					newfe_bh, orphan_name, +					&orphan_insert, orphan_dir); +			if (status < 0) { +				mlog_errno(status); +				goto bail; +			} +		}  	} else {  		/* if the name was not found in new_dir, add it now */  		status = ocfs2_add_entry(handle, new_dentry, old_inode, @@ -1605,6 +1755,7 @@ static int ocfs2_symlink(struct inode *dir,  	struct ocfs2_dir_lookup_result lookup = { NULL, };  	sigset_t oldset;  	int did_block_signals = 0; +	struct ocfs2_dentry_lock *dl = NULL;  	trace_ocfs2_symlink_begin(dir, dentry, symname,  				  dentry->d_name.len, dentry->d_name.name); @@ -1793,6 +1944,8 @@ static int ocfs2_symlink(struct inode *dir,  		goto bail;  	} +	dl = dentry->d_fsdata; +  	status = ocfs2_add_entry(handle, dentry, inode,  				 le64_to_cpu(fe->i_blkno), parent_fe_bh,  				 &lookup); @@ -1818,7 +1971,6 @@ bail:  	brelse(new_fe_bh);  	brelse(parent_fe_bh); -	kfree(si.name);  	kfree(si.value);  	ocfs2_free_dir_lookup_result(&lookup);  	if (inode_ac) @@ -1828,6 +1980,9 @@ bail:  	if (xattr_ac)  		ocfs2_free_alloc_context(xattr_ac);  	if ((status < 0) && inode) { +		if (dl) +			ocfs2_cleanup_add_entry_failure(osb, dentry, inode); +  		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;  		clear_nlink(inode);  		iput(inode); @@ -2444,6 +2599,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,  	di->i_orphaned_slot = 0;  	set_nlink(inode, 1);  	ocfs2_set_links_count(di, inode->i_nlink); +	ocfs2_update_inode_fsync_trans(handle, inode, 1);  	ocfs2_journal_dirty(handle, di_bh);  	status = ocfs2_add_entry(handle, dentry, inode, @@ -2504,4 +2660,5 @@ const struct inode_operations ocfs2_dir_iops = {  	.removexattr	= generic_removexattr,  	.fiemap         = ocfs2_fiemap,  	.get_acl	= ocfs2_iop_get_acl, +	.set_acl	= ocfs2_iop_set_acl,  }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 3a903470c79..bbec539230f 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -30,6 +30,7 @@  #include <linux/sched.h>  #include <linux/wait.h>  #include <linux/list.h> +#include <linux/llist.h>  #include <linux/rbtree.h>  #include <linux/workqueue.h>  #include <linux/kref.h> @@ -274,19 +275,16 @@ enum ocfs2_mount_options  	OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */  }; -#define OCFS2_OSB_SOFT_RO			0x0001 -#define OCFS2_OSB_HARD_RO			0x0002 -#define OCFS2_OSB_ERROR_FS			0x0004 -#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED	0x0008 - -#define OCFS2_DEFAULT_ATIME_QUANTUM		60 +#define OCFS2_OSB_SOFT_RO	0x0001 +#define OCFS2_OSB_HARD_RO	0x0002 +#define OCFS2_OSB_ERROR_FS	0x0004 +#define OCFS2_DEFAULT_ATIME_QUANTUM	60  struct ocfs2_journal;  struct ocfs2_slot_info;  struct ocfs2_recovery_map;  struct ocfs2_replay_map;  struct ocfs2_quota_recovery; -struct ocfs2_dentry_lock;  struct ocfs2_super  {  	struct task_struct *commit_task; @@ -387,6 +385,7 @@ struct ocfs2_super  	u8 osb_stackflags;  	char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; +	char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];  	struct ocfs2_cluster_connection *cconn;  	struct ocfs2_lock_res osb_super_lockres;  	struct ocfs2_lock_res osb_rename_lockres; @@ -413,10 +412,9 @@ struct ocfs2_super  	struct list_head blocked_lock_list;  	unsigned long blocked_lock_count; -	/* List of dentry locks to release. Anyone can add locks to -	 * the list, ocfs2_wq processes the list  */ -	struct ocfs2_dentry_lock *dentry_lock_list; -	struct work_struct dentry_lock_work; +	/* List of dquot structures to drop last reference to */ +	struct llist_head dquot_drop_list; +	struct work_struct dquot_drop_work;  	wait_queue_head_t		osb_mount_event; @@ -424,6 +422,7 @@ struct ocfs2_super  	struct inode			*osb_tl_inode;  	struct buffer_head		*osb_tl_bh;  	struct delayed_work		osb_truncate_log_wq; +	atomic_t			osb_tl_disable;  	/*  	 * How many clusters in our truncate log.  	 * It must be protected by osb_tl_inode->i_mutex. @@ -448,6 +447,8 @@ struct ocfs2_super  	/* rb tree root for refcount lock. */  	struct rb_root	osb_rf_lock_tree;  	struct ocfs2_refcount_tree *osb_ref_tree_lru; + +	struct mutex system_file_mutex;  };  #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info) @@ -578,18 +579,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,  	spin_unlock(&osb->osb_lock);  } - -static inline unsigned long  ocfs2_test_osb_flag(struct ocfs2_super *osb, -						 unsigned long flag) -{ -	unsigned long ret; - -	spin_lock(&osb->osb_lock); -	ret = osb->osb_flags & flag; -	spin_unlock(&osb->osb_lock); -	return ret; -} -  static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,  				     int hard)  { diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 1b60c62aa9d..6cb019b7c6a 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename,  		  __entry->new_len, __get_str(new_name))  ); +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted); +  TRACE_EVENT(ocfs2_rename_target_exists,  	TP_PROTO(int new_len, const char *new_name),  	TP_ARGS(new_len, new_name), diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index d5ab56cbe5c..f266d67df3c 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -28,6 +28,7 @@ struct ocfs2_dquot {  	unsigned int dq_use_count;	/* Number of nodes having reference to this entry in global quota file */  	s64 dq_origspace;	/* Last globally synced space usage */  	s64 dq_originodes;	/* Last globally synced inode usage */ +	struct llist_node list;	/* Member of list of dquots to drop */  };  /* Description of one chunk to recover in memory */ @@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,  int ocfs2_create_local_dquot(struct dquot *dquot);  int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);  int ocfs2_local_write_dquot(struct dquot *dquot); +void ocfs2_drop_dquot_refs(struct work_struct *work);  extern const struct dquot_operations ocfs2_quota_operations;  extern struct quota_format_type ocfs2_quota_format; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index aaa50611ec6..b990a62cff5 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -10,6 +10,7 @@  #include <linux/jiffies.h>  #include <linux/writeback.h>  #include <linux/workqueue.h> +#include <linux/llist.h>  #include <cluster/masklog.h> @@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)  	       OCFS2_INODE_UPDATE_CREDITS;  } +void ocfs2_drop_dquot_refs(struct work_struct *work) +{ +	struct ocfs2_super *osb = container_of(work, struct ocfs2_super, +					       dquot_drop_work); +	struct llist_node *list; +	struct ocfs2_dquot *odquot, *next_odquot; + +	list = llist_del_all(&osb->dquot_drop_list); +	llist_for_each_entry_safe(odquot, next_odquot, list, list) { +		/* Drop the reference we acquired in ocfs2_dquot_release() */ +		dqput(&odquot->dq_dquot); +	} +} + +/* + * Called when the last reference to dquot is dropped. If we are called from + * downconvert thread, we cannot do all the handling here because grabbing + * quota lock could deadlock (the node holding the quota lock could need some + * other cluster lock to proceed but with blocked downconvert thread we cannot + * release any lock). + */  static int ocfs2_release_dquot(struct dquot *dquot)  {  	handle_t *handle; @@ -694,6 +716,19 @@ static int ocfs2_release_dquot(struct dquot *dquot)  	/* Check whether we are not racing with some other dqget() */  	if (atomic_read(&dquot->dq_count) > 1)  		goto out; +	/* Running from downconvert thread? Postpone quota processing to wq */ +	if (current == osb->dc_task) { +		/* +		 * Grab our own reference to dquot and queue it for delayed +		 * dropping.  Quota code rechecks after calling +		 * ->release_dquot() and won't free dquot structure. +		 */ +		dqgrab(dquot); +		/* First entry on list -> queue work */ +		if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) +			queue_work(ocfs2_wq, &osb->dquot_drop_work); +		goto out; +	}  	status = ocfs2_lock_global_qf(oinfo, 1);  	if (status < 0)  		goto out; @@ -717,6 +752,12 @@ static int ocfs2_release_dquot(struct dquot *dquot)  	 */  	if (status < 0)  		mlog_errno(status); +	/* +	 * Clear dq_off so that we search for the structure in quota file next +	 * time we acquire it. The structure might be deleted and reallocated +	 * elsewhere by another node while our dquot structure is on freelist. +	 */ +	dquot->dq_off = 0;  	clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);  out_trans:  	ocfs2_commit_trans(osb, handle); @@ -756,16 +797,17 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)  	status = ocfs2_lock_global_qf(info, 1);  	if (status < 0)  		goto out; -	if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { -		status = ocfs2_qinfo_lock(info, 0); -		if (status < 0) -			goto out_dq; -		status = qtree_read_dquot(&info->dqi_gi, dquot); -		ocfs2_qinfo_unlock(info, 0); -		if (status < 0) -			goto out_dq; -	} -	set_bit(DQ_READ_B, &dquot->dq_flags); +	status = ocfs2_qinfo_lock(info, 0); +	if (status < 0) +		goto out_dq; +	/* +	 * We always want to read dquot structure from disk because we don't +	 * know what happened with it while it was on freelist. +	 */ +	status = qtree_read_dquot(&info->dqi_gi, dquot); +	ocfs2_qinfo_unlock(info, 0); +	if (status < 0) +		goto out_dq;  	OCFS2_DQUOT(dquot)->dq_use_count++;  	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 2e4344be3b9..2001862bf2b 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -1303,10 +1303,6 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)  	ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);  out: -	/* Clear the read bit so that next time someone uses this -	 * dquot he reads fresh info from disk and allocates local -	 * dquot structure */ -	clear_bit(DQ_READ_B, &dquot->dq_flags);  	return status;  } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index bf4dfc14bb2..636aab69ead 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -46,6 +46,7 @@  #include <linux/quotaops.h>  #include <linux/namei.h>  #include <linux/mount.h> +#include <linux/posix_acl.h>  struct ocfs2_cow_context {  	struct inode *inode; @@ -612,6 +613,11 @@ static int ocfs2_create_refcount_tree(struct inode *inode,  	}  	new_bh = sb_getblk(inode->i_sb, first_blkno); +	if (!new_bh) { +		ret = -ENOMEM; +		mlog_errno(ret); +		goto out_commit; +	}  	ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);  	ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh, @@ -1310,7 +1316,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,  	new_bh = sb_getblk(sb, blkno);  	if (new_bh == NULL) { -		ret = -EIO; +		ret = -ENOMEM;  		mlog_errno(ret);  		goto out;  	} @@ -1402,10 +1408,9 @@ static void swap_refcount_rec(void *a, void *b, int size)  {  	struct ocfs2_refcount_rec *l = a, *r = b, tmp; -	tmp = *(struct ocfs2_refcount_rec *)l; -	*(struct ocfs2_refcount_rec *)l = -			*(struct ocfs2_refcount_rec *)r; -	*(struct ocfs2_refcount_rec *)r = tmp; +	tmp = *l; +	*l = *r; +	*r = tmp;  }  /* @@ -1561,7 +1566,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,  	new_bh = sb_getblk(sb, blkno);  	if (new_bh == NULL) { -		ret = -EIO; +		ret = -ENOMEM;  		mlog_errno(ret);  		goto out;  	} @@ -2502,8 +2507,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,  		ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);  		*meta_add += ocfs2_extend_meta_needed(et.et_root_el);  		*credits += ocfs2_calc_extend_credits(sb, -						      et.et_root_el, -						      ref_blocks); +						      et.et_root_el);  	} else {  		*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;  		*meta_add += 1; @@ -2874,8 +2878,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,  		meta_add =  			ocfs2_extend_meta_needed(et->et_root_el); -	*credits += ocfs2_calc_extend_credits(sb, et->et_root_el, -					      num_clusters + 2); +	*credits += ocfs2_calc_extend_credits(sb, et->et_root_el);  	ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,  					       p_cluster, num_clusters, @@ -3031,7 +3034,7 @@ int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,  	for (i = 0; i < blocks; i++, old_block++, new_block++) {  		new_bh = sb_getblk(osb->sb, new_block);  		if (new_bh == NULL) { -			ret = -EIO; +			ret = -ENOMEM;  			mlog_errno(ret);  			break;  		} @@ -3625,8 +3628,7 @@ int ocfs2_refcounted_xattr_delete_need(struct inode *inode,  		ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);  		*credits += ocfs2_calc_extend_credits(inode->i_sb, -						      et.et_root_el, -						      ref_blocks); +						      et.et_root_el);  	}  out: @@ -4266,20 +4268,36 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,  	struct inode *inode = old_dentry->d_inode;  	struct buffer_head *old_bh = NULL;  	struct inode *new_orphan_inode = NULL; +	struct posix_acl *default_acl, *acl; +	umode_t mode;  	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))  		return -EOPNOTSUPP; -	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, +	mode = inode->i_mode; +	error = posix_acl_create(dir, &mode, &default_acl, &acl); +	if (error) { +		mlog_errno(error); +		goto out; +	} + +	error = ocfs2_create_inode_in_orphan(dir, mode,  					     &new_orphan_inode);  	if (error) {  		mlog_errno(error);  		goto out;  	} +	error = ocfs2_rw_lock(inode, 1); +	if (error) { +		mlog_errno(error); +		goto out; +	} +  	error = ocfs2_inode_lock(inode, &old_bh, 1);  	if (error) {  		mlog_errno(error); +		ocfs2_rw_unlock(inode, 1);  		goto out;  	} @@ -4291,6 +4309,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,  	up_write(&OCFS2_I(inode)->ip_xattr_sem);  	ocfs2_inode_unlock(inode, 1); +	ocfs2_rw_unlock(inode, 1);  	brelse(old_bh);  	if (error) { @@ -4301,11 +4320,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,  	/* If the security isn't preserved, we need to re-initialize them. */  	if (!preserve) {  		error = ocfs2_init_security_and_acl(dir, new_orphan_inode, -						    &new_dentry->d_name); +						    &new_dentry->d_name, +						    default_acl, acl);  		if (error)  			mlog_errno(error);  	}  out: +	if (default_acl) +		posix_acl_release(default_acl); +	if (acl) +		posix_acl_release(acl);  	if (!error) {  		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,  						       new_dentry); diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index ec55add7604..d5da6f62414 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -53,8 +53,6 @@   */  static u16 ocfs2_calc_new_backup_super(struct inode *inode,  				       struct ocfs2_group_desc *gd, -				       int new_clusters, -				       u32 first_new_cluster,  				       u16 cl_cpg,  				       int set)  { @@ -127,8 +125,6 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,  				     OCFS2_FEATURE_COMPAT_BACKUP_SB)) {  		backups = ocfs2_calc_new_backup_super(bm_inode,  						     group, -						     new_clusters, -						     first_new_cluster,  						     cl_cpg, 1);  		le16_add_cpu(&group->bg_free_bits_count, -1 * backups);  	} @@ -157,7 +153,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,  	spin_lock(&OCFS2_I(bm_inode)->ip_lock);  	OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); -	le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits); +	le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits);  	spin_unlock(&OCFS2_I(bm_inode)->ip_lock);  	i_size_write(bm_inode, le64_to_cpu(fe->i_size)); @@ -167,8 +163,6 @@ out_rollback:  	if (ret < 0) {  		ocfs2_calc_new_backup_super(bm_inode,  					    group, -					    new_clusters, -					    first_new_cluster,  					    cl_cpg, 0);  		le16_add_cpu(&group->bg_free_bits_count, backups);  		le16_add_cpu(&group->bg_bits, -1 * num_bits); @@ -469,6 +463,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)  	struct ocfs2_chain_list *cl;  	struct ocfs2_chain_rec *cr;  	u16 cl_bpc; +	u64 bg_ptr;  	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))  		return -EROFS; @@ -513,7 +508,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)  	ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);  	if (ret) {  		mlog_errno(ret); -		goto out_unlock; +		goto out_free_group_bh;  	}  	trace_ocfs2_group_add((unsigned long long)input->group, @@ -523,7 +518,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)  	if (IS_ERR(handle)) {  		mlog_errno(PTR_ERR(handle));  		ret = -EINVAL; -		goto out_unlock; +		goto out_free_group_bh;  	}  	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); @@ -538,12 +533,14 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)  	}  	group = (struct ocfs2_group_desc *)group_bh->b_data; +	bg_ptr = le64_to_cpu(group->bg_next_group);  	group->bg_next_group = cr->c_blkno;  	ocfs2_journal_dirty(handle, group_bh);  	ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),  				      main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);  	if (ret < 0) { +		group->bg_next_group = cpu_to_le64(bg_ptr);  		mlog_errno(ret);  		goto out_commit;  	} @@ -566,7 +563,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)  	spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);  	OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); -	le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits); +	le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits);  	spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);  	i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); @@ -574,8 +571,11 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)  out_commit:  	ocfs2_commit_trans(osb, handle); -out_unlock: + +out_free_group_bh:  	brelse(group_bh); + +out_unlock:  	brelse(main_bm_bh);  	ocfs2_inode_unlock(main_bm_inode, 1); diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index bf1f8930456..1724d43d3da 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)  	return 0;  } -static int o2cb_cluster_this_node(unsigned int *node) +static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn, +				  unsigned int *node)  {  	int node_num; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 286edf1e231..13a8537d8e8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -23,6 +23,7 @@  #include <linux/mutex.h>  #include <linux/slab.h>  #include <linux/reboot.h> +#include <linux/sched.h>  #include <asm/uaccess.h>  #include "stackglue.h" @@ -102,6 +103,12 @@  #define OCFS2_TEXT_UUID_LEN			32  #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN	2  #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN	8 +#define VERSION_LOCK				"version_lock" + +enum ocfs2_connection_type { +	WITH_CONTROLD, +	NO_CONTROLD +};  /*   * ocfs2_live_connection is refcounted because the filesystem and @@ -110,6 +117,13 @@  struct ocfs2_live_connection {  	struct list_head		oc_list;  	struct ocfs2_cluster_connection	*oc_conn; +	enum ocfs2_connection_type	oc_type; +	atomic_t                        oc_this_node; +	int                             oc_our_slot; +	struct dlm_lksb                 oc_version_lksb; +	char                            oc_lvb[DLM_LVB_LEN]; +	struct completion               oc_sync_wait; +	wait_queue_head_t		oc_wait;  };  struct ocfs2_control_private { @@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)   * mount path.  Since the VFS prevents multiple calls to   * fill_super(), we can't get dupes here.   */ -static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, -				     struct ocfs2_live_connection **c_ret) +static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn, +				     struct ocfs2_live_connection *c)  {  	int rc = 0; -	struct ocfs2_live_connection *c; - -	c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); -	if (!c) -		return -ENOMEM;  	mutex_lock(&ocfs2_control_lock);  	c->oc_conn = conn; -	if (atomic_read(&ocfs2_control_opened)) +	if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))  		list_add(&c->oc_list, &ocfs2_live_connection_list);  	else {  		printk(KERN_ERR @@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,  	}  	mutex_unlock(&ocfs2_control_lock); - -	if (!rc) -		*c_ret = c; -	else -		kfree(c); -  	return rc;  } @@ -799,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,  	return 0;  } +static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver) +{ +	struct ocfs2_protocol_version *pv = +		(struct ocfs2_protocol_version *)lvb; +	/* +	 * ocfs2_protocol_version has two u8 variables, so we don't +	 * need any endian conversion. +	 */ +	ver->pv_major = pv->pv_major; +	ver->pv_minor = pv->pv_minor; +} + +static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb) +{ +	struct ocfs2_protocol_version *pv = +		(struct ocfs2_protocol_version *)lvb; +	/* +	 * ocfs2_protocol_version has two u8 variables, so we don't +	 * need any endian conversion. +	 */ +	pv->pv_major = ver->pv_major; +	pv->pv_minor = ver->pv_minor; +} + +static void sync_wait_cb(void *arg) +{ +	struct ocfs2_cluster_connection *conn = arg; +	struct ocfs2_live_connection *lc = conn->cc_private; +	complete(&lc->oc_sync_wait); +} + +static int sync_unlock(struct ocfs2_cluster_connection *conn, +		struct dlm_lksb *lksb, char *name) +{ +	int error; +	struct ocfs2_live_connection *lc = conn->cc_private; + +	error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn); +	if (error) { +		printk(KERN_ERR "%s lkid %x error %d\n", +				name, lksb->sb_lkid, error); +		return error; +	} + +	wait_for_completion(&lc->oc_sync_wait); + +	if (lksb->sb_status != -DLM_EUNLOCK) { +		printk(KERN_ERR "%s lkid %x status %d\n", +				name, lksb->sb_lkid, lksb->sb_status); +		return -1; +	} +	return 0; +} + +static int sync_lock(struct ocfs2_cluster_connection *conn, +		int mode, uint32_t flags, +		struct dlm_lksb *lksb, char *name) +{ +	int error, status; +	struct ocfs2_live_connection *lc = conn->cc_private; + +	error = dlm_lock(conn->cc_lockspace, mode, lksb, flags, +			name, strlen(name), +			0, sync_wait_cb, conn, NULL); +	if (error) { +		printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n", +				name, lksb->sb_lkid, flags, mode, error); +		return error; +	} + +	wait_for_completion(&lc->oc_sync_wait); + +	status = lksb->sb_status; + +	if (status && status != -EAGAIN) { +		printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n", +				name, lksb->sb_lkid, flags, mode, status); +	} + +	return status; +} + + +static int version_lock(struct ocfs2_cluster_connection *conn, int mode, +		int flags) +{ +	struct ocfs2_live_connection *lc = conn->cc_private; +	return sync_lock(conn, mode, flags, +			&lc->oc_version_lksb, VERSION_LOCK); +} + +static int version_unlock(struct ocfs2_cluster_connection *conn) +{ +	struct ocfs2_live_connection *lc = conn->cc_private; +	return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK); +} + +/* get_protocol_version() + * + * To exchange ocfs2 versioning, we use the LVB of the version dlm lock. + * The algorithm is: + * 1. Attempt to take the lock in EX mode (non-blocking). + * 2. If successful (which means it is the first mount), write the + *    version number and downconvert to PR lock. + * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after + *    taking the PR lock. + */ + +static int get_protocol_version(struct ocfs2_cluster_connection *conn) +{ +	int ret; +	struct ocfs2_live_connection *lc = conn->cc_private; +	struct ocfs2_protocol_version pv; + +	running_proto.pv_major = +		ocfs2_user_plugin.sp_max_proto.pv_major; +	running_proto.pv_minor = +		ocfs2_user_plugin.sp_max_proto.pv_minor; + +	lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb; +	ret = version_lock(conn, DLM_LOCK_EX, +			DLM_LKF_VALBLK|DLM_LKF_NOQUEUE); +	if (!ret) { +		conn->cc_version.pv_major = running_proto.pv_major; +		conn->cc_version.pv_minor = running_proto.pv_minor; +		version_to_lvb(&running_proto, lc->oc_lvb); +		version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK); +	} else if (ret == -EAGAIN) { +		ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK); +		if (ret) +			goto out; +		lvb_to_version(lc->oc_lvb, &pv); + +		if ((pv.pv_major != running_proto.pv_major) || +				(pv.pv_minor > running_proto.pv_minor)) { +			ret = -EINVAL; +			goto out; +		} + +		conn->cc_version.pv_major = pv.pv_major; +		conn->cc_version.pv_minor = pv.pv_minor; +	} +out: +	return ret; +} + +static void user_recover_prep(void *arg) +{ +} + +static void user_recover_slot(void *arg, struct dlm_slot *slot) +{ +	struct ocfs2_cluster_connection *conn = arg; +	printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n", +			slot->nodeid, slot->slot); +	conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data); + +} + +static void user_recover_done(void *arg, struct dlm_slot *slots, +		int num_slots, int our_slot, +		uint32_t generation) +{ +	struct ocfs2_cluster_connection *conn = arg; +	struct ocfs2_live_connection *lc = conn->cc_private; +	int i; + +	for (i = 0; i < num_slots; i++) +		if (slots[i].slot == our_slot) { +			atomic_set(&lc->oc_this_node, slots[i].nodeid); +			break; +		} + +	lc->oc_our_slot = our_slot; +	wake_up(&lc->oc_wait); +} + +static const struct dlm_lockspace_ops ocfs2_ls_ops = { +	.recover_prep = user_recover_prep, +	.recover_slot = user_recover_slot, +	.recover_done = user_recover_done, +}; + +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ +	version_unlock(conn); +	dlm_release_lockspace(conn->cc_lockspace, 2); +	conn->cc_lockspace = NULL; +	ocfs2_live_connection_drop(conn->cc_private); +	conn->cc_private = NULL; +	return 0; +} +  static int user_cluster_connect(struct ocfs2_cluster_connection *conn)  {  	dlm_lockspace_t *fsdlm; -	struct ocfs2_live_connection *uninitialized_var(control); -	int rc = 0; +	struct ocfs2_live_connection *lc; +	int rc, ops_rv;  	BUG_ON(conn == NULL); -	rc = ocfs2_live_connection_new(conn, &control); +	lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); +	if (!lc) { +		rc = -ENOMEM; +		goto out; +	} + +	init_waitqueue_head(&lc->oc_wait); +	init_completion(&lc->oc_sync_wait); +	atomic_set(&lc->oc_this_node, 0); +	conn->cc_private = lc; +	lc->oc_type = NO_CONTROLD; + +	rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, +			       DLM_LSFL_FS, DLM_LVB_LEN, +			       &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); +	if (rc) +		goto out; + +	if (ops_rv == -EOPNOTSUPP) { +		lc->oc_type = WITH_CONTROLD; +		printk(KERN_NOTICE "ocfs2: You seem to be using an older " +				"version of dlm_controld and/or ocfs2-tools." +				" Please consider upgrading.\n"); +	} else if (ops_rv) { +		rc = ops_rv; +		goto out; +	} +	conn->cc_lockspace = fsdlm; + +	rc = ocfs2_live_connection_attach(conn, lc);  	if (rc)  		goto out; +	if (lc->oc_type == NO_CONTROLD) { +		rc = get_protocol_version(conn); +		if (rc) { +			printk(KERN_ERR "ocfs2: Could not determine" +					" locking version\n"); +			user_cluster_disconnect(conn); +			goto out; +		} +		wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); +	} +  	/*  	 * running_proto must have been set before we allowed any mounts  	 * to proceed. @@ -818,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)  	if (fs_protocol_compare(&running_proto, &conn->cc_version)) {  		printk(KERN_ERR  		       "Unable to mount with fs locking protocol version " -		       "%u.%u because the userspace control daemon has " -		       "negotiated %u.%u\n", +		       "%u.%u because negotiated protocol is %u.%u\n",  		       conn->cc_version.pv_major, conn->cc_version.pv_minor,  		       running_proto.pv_major, running_proto.pv_minor);  		rc = -EPROTO; -		ocfs2_live_connection_drop(control); -		goto out; -	} - -	rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, -			       NULL, NULL, NULL, &fsdlm); -	if (rc) { -		ocfs2_live_connection_drop(control); -		goto out; +		ocfs2_live_connection_drop(lc); +		lc = NULL;  	} -	conn->cc_private = control; -	conn->cc_lockspace = fsdlm;  out: +	if (rc && lc) +		kfree(lc);  	return rc;  } -static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) -{ -	dlm_release_lockspace(conn->cc_lockspace, 2); -	conn->cc_lockspace = NULL; -	ocfs2_live_connection_drop(conn->cc_private); -	conn->cc_private = NULL; -	return 0; -} -static int user_cluster_this_node(unsigned int *this_node) +static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, +				  unsigned int *this_node)  {  	int rc; +	struct ocfs2_live_connection *lc = conn->cc_private; + +	if (lc->oc_type == WITH_CONTROLD) +		rc = ocfs2_control_get_this_node(); +	else if (lc->oc_type == NO_CONTROLD) +		rc = atomic_read(&lc->oc_this_node); +	else +		rc = -EINVAL; -	rc = ocfs2_control_get_this_node();  	if (rc < 0)  		return rc; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 39abf89697e..5d965e83bd4 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,  EXPORT_SYMBOL_GPL(ocfs2_plock);  int ocfs2_cluster_connect(const char *stack_name, +			  const char *cluster_name, +			  int cluster_name_len,  			  const char *group,  			  int grouplen,  			  struct ocfs2_locking_protocol *lproto, @@ -342,8 +344,12 @@ int ocfs2_cluster_connect(const char *stack_name,  		goto out;  	} -	memcpy(new_conn->cc_name, group, grouplen); +	strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);  	new_conn->cc_namelen = grouplen; +	if (cluster_name_len) +		strlcpy(new_conn->cc_cluster_name, cluster_name, +			CLUSTER_NAME_MAX + 1); +	new_conn->cc_cluster_name_len = cluster_name_len;  	new_conn->cc_recovery_handler = recovery_handler;  	new_conn->cc_recovery_data = recovery_data; @@ -386,8 +392,9 @@ int ocfs2_cluster_connect_agnostic(const char *group,  	if (cluster_stack_name[0])  		stack_name = cluster_stack_name; -	return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, -				     recovery_handler, recovery_data, conn); +	return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, +				     lproto, recovery_handler, recovery_data, +				     conn);  }  EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); @@ -460,9 +467,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen)  }  EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); -int ocfs2_cluster_this_node(unsigned int *node) +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, +			    unsigned int *node)  { -	return active_stack->sp_ops->this_node(node); +	return active_stack->sp_ops->this_node(conn, node);  }  EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); @@ -488,7 +496,7 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,  }  static struct kobj_attribute ocfs2_attr_max_locking_protocol = -	__ATTR(max_locking_protocol, S_IFREG | S_IRUGO, +	__ATTR(max_locking_protocol, S_IRUGO,  	       ocfs2_max_locking_protocol_show, NULL);  static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, @@ -520,7 +528,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,  }  static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = -	__ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, +	__ATTR(loaded_cluster_plugins, S_IRUGO,  	       ocfs2_loaded_cluster_plugins_show, NULL);  static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, @@ -542,7 +550,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,  }  static struct kobj_attribute ocfs2_attr_active_cluster_plugin = -	__ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, +	__ATTR(active_cluster_plugin, S_IRUGO,  	       ocfs2_active_cluster_plugin_show, NULL);  static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, @@ -591,15 +599,29 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,  static struct kobj_attribute ocfs2_attr_cluster_stack = -	__ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR, +	__ATTR(cluster_stack, S_IRUGO | S_IWUSR,  	       ocfs2_cluster_stack_show,  	       ocfs2_cluster_stack_store); + + +static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj, +					struct kobj_attribute *attr, +					char *buf) +{ +	return snprintf(buf, PAGE_SIZE, "1\n"); +} + +static struct kobj_attribute ocfs2_attr_dlm_recover_support = +	__ATTR(dlm_recover_callback_support, S_IRUGO, +	       ocfs2_dlm_recover_show, NULL); +  static struct attribute *ocfs2_attrs[] = {  	&ocfs2_attr_max_locking_protocol.attr,  	&ocfs2_attr_loaded_cluster_plugins.attr,  	&ocfs2_attr_active_cluster_plugin.attr,  	&ocfs2_attr_cluster_stack.attr, +	&ocfs2_attr_dlm_recover_support.attr,  	NULL,  }; @@ -643,7 +665,7 @@ error:  #define FS_OCFS2_NM		1 -static ctl_table ocfs2_nm_table[] = { +static struct ctl_table ocfs2_nm_table[] = {  	{  		.procname	= "hb_ctl_path",  		.data		= ocfs2_hb_ctl_path, @@ -654,7 +676,7 @@ static ctl_table ocfs2_nm_table[] = {  	{ }  }; -static ctl_table ocfs2_mod_table[] = { +static struct ctl_table ocfs2_mod_table[] = {  	{  		.procname	= "nm",  		.data		= NULL, @@ -665,7 +687,7 @@ static ctl_table ocfs2_mod_table[] = {  	{ }  }; -static ctl_table ocfs2_kern_table[] = { +static struct ctl_table ocfs2_kern_table[] = {  	{  		.procname	= "ocfs2",  		.data		= NULL, @@ -676,7 +698,7 @@ static ctl_table ocfs2_kern_table[] = {  	{ }  }; -static ctl_table ocfs2_root_table[] = { +static struct ctl_table ocfs2_root_table[] = {  	{  		.procname	= "fs",  		.data		= NULL, @@ -687,7 +709,7 @@ static ctl_table ocfs2_root_table[] = {  	{ }  }; -static struct ctl_table_header *ocfs2_table_header = NULL; +static struct ctl_table_header *ocfs2_table_header;  /* diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 1ec56fdb8d0..66334a30cea 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -45,6 +45,9 @@ struct file_lock;   */  #define GROUP_NAME_MAX		64 +/* This shadows  OCFS2_CLUSTER_NAME_LEN */ +#define CLUSTER_NAME_MAX	16 +  /*   * ocfs2_protocol_version changes when ocfs2 does something different in @@ -97,8 +100,10 @@ struct ocfs2_locking_protocol {   * locking compatibility.   */  struct ocfs2_cluster_connection { -	char cc_name[GROUP_NAME_MAX]; +	char cc_name[GROUP_NAME_MAX + 1];  	int cc_namelen; +	char cc_cluster_name[CLUSTER_NAME_MAX + 1]; +	int cc_cluster_name_len;  	struct ocfs2_protocol_version cc_version;  	struct ocfs2_locking_protocol *cc_proto;  	void (*cc_recovery_handler)(int node_num, void *recovery_data); @@ -152,7 +157,8 @@ struct ocfs2_stack_operations {  	 * ->this_node() returns the cluster's unique identifier for the  	 * local node.  	 */ -	int (*this_node)(unsigned int *node); +	int (*this_node)(struct ocfs2_cluster_connection *conn, +			 unsigned int *node);  	/*  	 * Call the underlying dlm lock function.  The ->dlm_lock() @@ -239,6 +245,8 @@ struct ocfs2_stack_plugin {  /* Used by the filesystem */  int ocfs2_cluster_connect(const char *stack_name, +			  const char *cluster_name, +			  int cluster_name_len,  			  const char *group,  			  int grouplen,  			  struct ocfs2_locking_protocol *lproto, @@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group,  int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,  			     int hangup_pending);  void ocfs2_cluster_hangup(const char *group, int grouplen); -int ocfs2_cluster_this_node(unsigned int *node); +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, +			    unsigned int *node);  struct ocfs2_lock_res;  int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 5397c07ce60..0cb889a17ae 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,  				     struct ocfs2_suballoc_result *res);  static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,  					 int nr); -static inline int ocfs2_block_group_set_bits(handle_t *handle, -					     struct inode *alloc_inode, -					     struct ocfs2_group_desc *bg, -					     struct buffer_head *group_bh, -					     unsigned int bit_off, -					     unsigned int num_bits);  static int ocfs2_relink_block_group(handle_t *handle,  				    struct inode *alloc_inode,  				    struct buffer_head *fe_bh, @@ -481,7 +475,7 @@ ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,  	bg_bh = sb_getblk(osb->sb, bg_blkno);  	if (!bg_bh) { -		status = -EIO; +		status = -ENOMEM;  		mlog_errno(status);  		goto bail;  	} @@ -661,7 +655,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle,  	bg_bh = sb_getblk(osb->sb, bg_blkno);  	if (!bg_bh) { -		status = -EIO; +		status = -ENOMEM;  		mlog_errno(status);  		goto bail;  	} @@ -777,6 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,  	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);  	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));  	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); +	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);  	status = 0; @@ -1343,7 +1338,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,  	return status;  } -static inline int ocfs2_block_group_set_bits(handle_t *handle, +int ocfs2_block_group_set_bits(handle_t *handle,  					     struct inode *alloc_inode,  					     struct ocfs2_group_desc *bg,  					     struct buffer_head *group_bh, @@ -1388,8 +1383,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,  	ocfs2_journal_dirty(handle, group_bh);  bail: -	if (status) -		mlog_errno(status);  	return status;  } @@ -1588,7 +1581,7 @@ static int ocfs2_block_group_search(struct inode *inode,  	return ret;  } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, +int ocfs2_alloc_dinode_update_counts(struct inode *inode,  				       handle_t *handle,  				       struct buffer_head *di_bh,  				       u32 num_bits, @@ -1615,6 +1608,21 @@ out:  	return ret;  } +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, +				       struct buffer_head *di_bh, +				       u32 num_bits, +				       u16 chain) +{ +	u32 tmp_used; +	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; +	struct ocfs2_chain_list *cl; + +	cl = (struct ocfs2_chain_list *)&di->id2.i_chain; +	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); +	di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits); +	le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits); +} +  static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,  					 struct ocfs2_extent_rec *rec,  					 struct ocfs2_chain_list *cl) @@ -1715,8 +1723,12 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,  	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,  					 res->sr_bit_offset, res->sr_bits); -	if (ret < 0) +	if (ret < 0) { +		ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, +					       res->sr_bits, +					       le16_to_cpu(gd->bg_chain));  		mlog_errno(ret); +	}  out_loc_only:  	*bits_left = le16_to_cpu(gd->bg_free_bits_count); @@ -1846,6 +1858,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,  					    res->sr_bit_offset,  					    res->sr_bits);  	if (status < 0) { +		ocfs2_rollback_alloc_dinode_counts(alloc_inode, +					ac->ac_bh, res->sr_bits, chain);  		mlog_errno(status);  		goto bail;  	} @@ -2099,7 +2113,7 @@ int ocfs2_find_new_inode_loc(struct inode *dir,  	ac->ac_find_loc_priv = res;  	*fe_blkno = res->sr_blkno; - +	ocfs2_update_inode_fsync_trans(handle, dir, 0);  out:  	if (handle)  		ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); @@ -2157,6 +2171,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle,  					 res->sr_bit_offset,  					 res->sr_bits);  	if (ret < 0) { +		ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, +					       ac->ac_bh, res->sr_bits, chain);  		mlog_errno(ret);  		goto out;  	} @@ -2878,6 +2894,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)  	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);  	if (status < 0) {  		mutex_unlock(&inode_alloc_inode->i_mutex); +		iput(inode_alloc_inode);  		mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",  		     (u32)suballoc_slot, status);  		goto bail; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a36d0aa5091..2d2501767c0 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -86,6 +86,22 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,  			   u32 bits_wanted,  			   struct ocfs2_alloc_context **ac); +int ocfs2_alloc_dinode_update_counts(struct inode *inode, +			 handle_t *handle, +			 struct buffer_head *di_bh, +			 u32 num_bits, +			 u16 chain); +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, +			 struct buffer_head *di_bh, +			 u32 num_bits, +			 u16 chain); +int ocfs2_block_group_set_bits(handle_t *handle, +			 struct inode *alloc_inode, +			 struct ocfs2_group_desc *bg, +			 struct buffer_head *group_bh, +			 unsigned int bit_off, +			 unsigned int num_bits); +  int ocfs2_claim_metadata(handle_t *handle,  			 struct ocfs2_alloc_context *ac,  			 u32 bits_wanted, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index d4e81e4a9b0..ddb662b3244 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -68,7 +68,6 @@  #include "super.h"  #include "sysfile.h"  #include "uptodate.h" -#include "ver.h"  #include "xattr.h"  #include "quota.h"  #include "refcounttree.h" @@ -76,7 +75,7 @@  #include "buffer_head_io.h" -static struct kmem_cache *ocfs2_inode_cachep = NULL; +static struct kmem_cache *ocfs2_inode_cachep;  struct kmem_cache *ocfs2_dquot_cachep;  struct kmem_cache *ocfs2_qf_chunk_cachep; @@ -86,10 +85,11 @@ struct kmem_cache *ocfs2_qf_chunk_cachep;   * workqueue and schedule on our own. */  struct workqueue_struct *ocfs2_wq = NULL; -static struct dentry *ocfs2_debugfs_root = NULL; +static struct dentry *ocfs2_debugfs_root;  MODULE_AUTHOR("Oracle");  MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster file system");  struct mount_options  { @@ -561,6 +561,9 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)  	if (!oi)  		return NULL; +	oi->i_sync_tid = 0; +	oi->i_datasync_tid = 0; +  	jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);  	return &oi->vfs_inode;  } @@ -631,6 +634,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)  	struct ocfs2_super *osb = OCFS2_SB(sb);  	u32 tmp; +	sync_filesystem(sb); +  	if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||  	    !ocfs2_check_set_options(sb, &parsed_options)) {  		ret = -EINVAL; @@ -1238,30 +1243,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type,  	return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);  } -static void ocfs2_kill_sb(struct super_block *sb) -{ -	struct ocfs2_super *osb = OCFS2_SB(sb); - -	/* Failed mount? */ -	if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) -		goto out; - -	/* Prevent further queueing of inode drop events */ -	spin_lock(&dentry_list_lock); -	ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); -	spin_unlock(&dentry_list_lock); -	/* Wait for work to finish and/or remove it */ -	cancel_work_sync(&osb->dentry_lock_work); -out: -	kill_block_super(sb); -} -  static struct file_system_type ocfs2_fs_type = {  	.owner          = THIS_MODULE,  	.name           = "ocfs2",  	.mount          = ocfs2_mount, -	.kill_sb        = ocfs2_kill_sb, - +	.kill_sb        = kill_block_super,  	.fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,  	.next           = NULL  }; @@ -1612,16 +1598,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)  	return 0;  } -wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; -  static int __init ocfs2_init(void)  { -	int status, i; - -	ocfs2_print_version(); - -	for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) -		init_waitqueue_head(&ocfs2__ioend_wq[i]); +	int status;  	status = init_ocfs2_uptodate_cache();  	if (status < 0) @@ -1763,7 +1742,7 @@ static void ocfs2_inode_init_once(void *data)  	ocfs2_extent_map_init(&oi->vfs_inode);  	INIT_LIST_HEAD(&oi->ip_io_markers);  	oi->ip_dir_start_lookup = 0; -	atomic_set(&oi->ip_unaligned_aio, 0); +	mutex_init(&oi->ip_unaligned_aio);  	init_rwsem(&oi->ip_alloc_sem);  	init_rwsem(&oi->ip_xattr_sem);  	mutex_init(&oi->ip_io_mutex); @@ -1848,8 +1827,8 @@ static int ocfs2_get_sector(struct super_block *sb,  	*bh = sb_getblk(sb, block);  	if (!*bh) { -		mlog_errno(-EIO); -		return -EIO; +		mlog_errno(-ENOMEM); +		return -ENOMEM;  	}  	lock_buffer(*bh);  	if (!buffer_dirty(*bh)) @@ -1934,17 +1913,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)  	debugfs_remove(osb->osb_ctxt); -	/* -	 * Flush inode dropping work queue so that deletes are -	 * performed while the filesystem is still working -	 */ -	ocfs2_drop_all_dl_inodes(osb); -  	/* Orphan scan should be stopped as early as possible */  	ocfs2_orphan_scan_stop(osb);  	ocfs2_disable_quotas(osb); +	/* All dquots should be freed by now */ +	WARN_ON(!llist_empty(&osb->dquot_drop_list)); +	/* Wait for worker to be done with the work structure in osb */ +	cancel_work_sync(&osb->dquot_drop_work); +  	ocfs2_shutdown_local_alloc(osb);  	ocfs2_truncate_log_shutdown(osb); @@ -2075,7 +2053,6 @@ static int ocfs2_initialize_super(struct super_block *sb,  	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;  	struct inode *inode = NULL;  	struct ocfs2_journal *journal; -	__le32 uuid_net_key;  	struct ocfs2_super *osb;  	u64 total_blocks; @@ -2121,6 +2098,8 @@ static int ocfs2_initialize_super(struct super_block *sb,  	spin_lock_init(&osb->osb_xattr_lock);  	ocfs2_init_steal_slots(osb); +	mutex_init(&osb->system_file_mutex); +  	atomic_set(&osb->alloc_stats.moves, 0);  	atomic_set(&osb->alloc_stats.local_data, 0);  	atomic_set(&osb->alloc_stats.bitmap_data, 0); @@ -2225,10 +2204,9 @@ static int ocfs2_initialize_super(struct super_block *sb,  	if (ocfs2_clusterinfo_valid(osb)) {  		osb->osb_stackflags =  			OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; -		memcpy(osb->osb_cluster_stack, +		strlcpy(osb->osb_cluster_stack,  		       OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, -		       OCFS2_STACK_LABEL_LEN); -		osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; +		       OCFS2_STACK_LABEL_LEN + 1);  		if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {  			mlog(ML_ERROR,  			     "couldn't mount because of an invalid " @@ -2237,6 +2215,9 @@ static int ocfs2_initialize_super(struct super_block *sb,  			status = -EINVAL;  			goto bail;  		} +		strlcpy(osb->osb_cluster_name, +			OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, +			OCFS2_CLUSTER_NAME_LEN + 1);  	} else {  		/* The empty string is identical with classic tools that  		 * don't know about s_cluster_info. */ @@ -2272,8 +2253,8 @@ static int ocfs2_initialize_super(struct super_block *sb,  	INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);  	journal->j_state = OCFS2_JOURNAL_FREE; -	INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); -	osb->dentry_lock_list = NULL; +	INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); +	init_llist_head(&osb->dquot_drop_list);  	/* get some pseudo constants for clustersize bits */  	osb->s_clustersize_bits = @@ -2307,10 +2288,8 @@ static int ocfs2_initialize_super(struct super_block *sb,  		goto bail;  	} -	memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); - -	strncpy(osb->vol_label, di->id2.i_super.s_label, 63); -	osb->vol_label[63] = '\0'; +	strlcpy(osb->vol_label, di->id2.i_super.s_label, +		OCFS2_MAX_VOL_LABEL_LEN);  	osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);  	osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);  	osb->first_cluster_group_blkno = diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index f053688d22a..af155c18312 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -113,9 +113,11 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,  	} else  		arr = get_local_system_inode(osb, type, slot); +	mutex_lock(&osb->system_file_mutex);  	if (arr && ((inode = *arr) != NULL)) {  		/* get a ref in addition to the array ref */  		inode = igrab(inode); +		mutex_unlock(&osb->system_file_mutex);  		BUG_ON(!inode);  		return inode; @@ -129,6 +131,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,  		*arr = igrab(inode);  		BUG_ON(!*arr);  	} +	mutex_unlock(&osb->system_file_mutex);  	return inode;  } diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 52eaf33d346..82e17b076ce 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -67,7 +67,7 @@ struct ocfs2_meta_cache_item {  	sector_t	c_block;  }; -static struct kmem_cache *ocfs2_uptodate_cachep = NULL; +static struct kmem_cache *ocfs2_uptodate_cachep;  u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)  { diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c deleted file mode 100644 index e2488f4128a..00000000000 --- a/fs/ocfs2/ver.c +++ /dev/null @@ -1,43 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle.  All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/string.h> -#include <linux/kernel.h> - -#include "ver.h" - -#define OCFS2_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION - -void ocfs2_print_version(void) -{ -	printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(OCFS2_BUILD_VERSION); diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h deleted file mode 100644 index d7395cb91d2..00000000000 --- a/fs/ocfs2/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2002, 2004 Oracle.  All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef OCFS2_VER_H -#define OCFS2_VER_H - -void ocfs2_print_version(void); - -#endif /* OCFS2_VER_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 6ce0686eab7..016f01df382 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -99,8 +99,8 @@ static struct ocfs2_xattr_def_value_root def_xv = {  const struct xattr_handler *ocfs2_xattr_handlers[] = {  	&ocfs2_xattr_user_handler, -	&ocfs2_xattr_acl_access_handler, -	&ocfs2_xattr_acl_default_handler, +	&posix_acl_access_xattr_handler, +	&posix_acl_default_xattr_handler,  	&ocfs2_xattr_trusted_handler,  	&ocfs2_xattr_security_handler,  	NULL @@ -109,9 +109,9 @@ const struct xattr_handler *ocfs2_xattr_handlers[] = {  static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {  	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,  	[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] -					= &ocfs2_xattr_acl_access_handler, +					= &posix_acl_access_xattr_handler,  	[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] -					= &ocfs2_xattr_acl_default_handler, +					= &posix_acl_default_xattr_handler,  	[OCFS2_XATTR_INDEX_TRUSTED]	= &ocfs2_xattr_trusted_handler,  	[OCFS2_XATTR_INDEX_SECURITY]	= &ocfs2_xattr_security_handler,  }; @@ -369,7 +369,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)   * them fully.   */  static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, -				   u64 xb_blkno) +				   u64 xb_blkno, int new)  {  	int i, rc = 0; @@ -377,15 +377,22 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,  		bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,  					      xb_blkno + i);  		if (!bucket->bu_bhs[i]) { -			rc = -EIO; +			rc = -ENOMEM;  			mlog_errno(rc);  			break;  		}  		if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), -					   bucket->bu_bhs[i])) -			ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), -						      bucket->bu_bhs[i]); +					   bucket->bu_bhs[i])) { +			if (new) +				ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), +							      bucket->bu_bhs[i]); +			else { +				set_buffer_uptodate(bucket->bu_bhs[i]); +				ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode), +							  bucket->bu_bhs[i]); +			} +		}  	}  	if (rc) @@ -754,8 +761,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,  			BUG_ON(why == RESTART_META);  			credits = ocfs2_calc_extend_credits(inode->i_sb, -							    &vb->vb_xv->xr_list, -							    clusters_to_add); +							    &vb->vb_xv->xr_list);  			status = ocfs2_extend_trans(handle, credits);  			if (status < 0) {  				status = -ENOMEM; @@ -2603,6 +2609,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)  	oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);  	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);  	spin_unlock(&oi->ip_lock); +	ocfs2_update_inode_fsync_trans(handle, inode, 0);  	ocfs2_journal_dirty(handle, di_bh);  out_commit: @@ -2865,6 +2872,12 @@ static int ocfs2_create_xattr_block(struct inode *inode,  	}  	new_bh = sb_getblk(inode->i_sb, first_blkno); +	if (!new_bh) { +		ret = -ENOMEM; +		mlog_errno(ret); +		goto end; +	} +  	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);  	ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode), @@ -3040,8 +3053,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,  		if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {  			clusters_add += new_clusters;  			credits += ocfs2_calc_extend_credits(inode->i_sb, -							&def_xv.xv.xr_list, -							new_clusters); +							&def_xv.xv.xr_list);  		}  		goto meta_guess; @@ -3106,8 +3118,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,  			if (!ocfs2_xattr_is_local(xe))  				credits += ocfs2_calc_extend_credits(  							inode->i_sb, -							&def_xv.xv.xr_list, -							new_clusters); +							&def_xv.xv.xr_list);  			goto out;  		}  	} @@ -3132,9 +3143,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,  			meta_add += ocfs2_extend_meta_needed(&xv->xr_list);  			clusters_add += new_clusters - old_clusters;  			credits += ocfs2_calc_extend_credits(inode->i_sb, -							     &xv->xr_list, -							     new_clusters - -							     old_clusters); +							     &xv->xr_list);  			if (value_size >= OCFS2_XATTR_ROOT_SIZE)  				goto out;  		} @@ -3180,7 +3189,7 @@ meta_guess:  				 &xb->xb_attrs.xb_root.xt_list;  			meta_add += ocfs2_extend_meta_needed(el);  			credits += ocfs2_calc_extend_credits(inode->i_sb, -							     el, 1); +							     el);  		} else  			credits += OCFS2_SUBALLOC_ALLOC + 1; @@ -3199,8 +3208,15 @@ meta_guess:  			clusters_add += 1;  		}  	} else { -		meta_add += 1;  		credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; +		if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { +			struct ocfs2_extent_list *el = &def_xv.xv.xr_list; +			meta_add += ocfs2_extend_meta_needed(el); +			credits += ocfs2_calc_extend_credits(inode->i_sb, +							     el); +		} else { +			meta_add += 1; +		}  	}  out:  	if (clusters_need) @@ -3613,6 +3629,7 @@ int ocfs2_xattr_set(struct inode *inode,  	}  	ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); +	ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0);  	ocfs2_commit_trans(osb, ctxt.handle); @@ -4293,7 +4310,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,  	trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); -	ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); +	ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1);  	if (ret) {  		mlog_errno(ret);  		goto out; @@ -4637,7 +4654,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,  	 * Even if !new_bucket_head, we're overwriting t_bucket.  Thus,  	 * there's no need to read it.  	 */ -	ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); +	ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head);  	if (ret) {  		mlog_errno(ret);  		goto out; @@ -4803,7 +4820,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,  	 * Even if !t_is_new, we're overwriting t_bucket.  Thus,  	 * there's no need to read it.  	 */ -	ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); +	ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new);  	if (ret)  		goto out; @@ -5475,6 +5492,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,  	ret = ocfs2_truncate_log_append(osb, handle, blkno, len);  	if (ret)  		mlog_errno(ret); +	ocfs2_update_inode_fsync_trans(handle, inode, 0);  out_commit:  	ocfs2_commit_trans(osb, handle); @@ -6216,8 +6234,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,  			  le16_to_cpu(xv->xr_list.l_next_free_rec);  		*credits += ocfs2_calc_extend_credits(sb, -						&def_xv.xv.xr_list, -						le32_to_cpu(xv->xr_clusters)); +						&def_xv.xv.xr_list);  		/*  		 * If the value is a tree with depth > 1, We don't go deep @@ -6782,7 +6799,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(  		metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);  	*credits += ocfs2_calc_extend_credits(osb->sb, -					      xt_et->et_root_el, len); +					      xt_et->et_root_el);  	if (metas.num_metas) {  		ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas, @@ -6830,7 +6847,7 @@ static int ocfs2_reflink_xattr_bucket(handle_t *handle,  			break;  		} -		ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); +		ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1);  		if (ret) {  			mlog_errno(ret);  			break; @@ -7190,10 +7207,12 @@ out:   */  int ocfs2_init_security_and_acl(struct inode *dir,  				struct inode *inode, -				const struct qstr *qstr) +				const struct qstr *qstr, +				struct posix_acl *default_acl, +				struct posix_acl *acl)  { -	int ret = 0;  	struct buffer_head *dir_bh = NULL; +	int ret = 0;  	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);  	if (ret) { @@ -7207,9 +7226,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,  		goto leave;  	} -	ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); -	if (ret) -		mlog_errno(ret); +	if (!ret && default_acl) +		ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); +	if (!ret && acl) +		ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS);  	ocfs2_inode_unlock(dir, 0);  	brelse(dir_bh); diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 19f134e896a..f10d5b93c36 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -40,8 +40,6 @@ struct ocfs2_security_xattr_info {  extern const struct xattr_handler ocfs2_xattr_user_handler;  extern const struct xattr_handler ocfs2_xattr_trusted_handler;  extern const struct xattr_handler ocfs2_xattr_security_handler; -extern const struct xattr_handler ocfs2_xattr_acl_access_handler; -extern const struct xattr_handler ocfs2_xattr_acl_default_handler;  extern const struct xattr_handler *ocfs2_xattr_handlers[];  ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); @@ -96,5 +94,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,  			 bool preserve_security);  int ocfs2_init_security_and_acl(struct inode *dir,  				struct inode *inode, -				const struct qstr *qstr); +				const struct qstr *qstr, +				struct posix_acl *default_acl, +				struct posix_acl *acl);  #endif /* OCFS2_XATTR_H */  | 
